diff --git a/.github/workflows/c910v.yml b/.github/workflows/c910v.yml index 30cf32b34..68ba2ddd7 100644 --- a/.github/workflows/c910v.yml +++ b/.github/workflows/c910v.yml @@ -14,8 +14,8 @@ jobs: if: "github.repository == 'OpenMathLib/OpenBLAS'" runs-on: ubuntu-latest env: - xuetie_toolchain: https://occ-oss-prod.oss-cn-hangzhou.aliyuncs.com/resource//1663142514282 - toolchain_file_name: Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1-20220906.tar.gz + xuetie_toolchain: https://occ-oss-prod.oss-cn-hangzhou.aliyuncs.com/resource//1698113812618 + toolchain_file_name: Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.8.0-20231018.tar.gz strategy: fail-fast: false matrix: @@ -76,7 +76,7 @@ jobs: run: | wget ${xuetie_toolchain}/${toolchain_file_name} tar -xvf ${toolchain_file_name} -C /opt - export PATH="/opt/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1/bin:$PATH" + export PATH="/opt/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.8.0/bin:$PATH" make CC='ccache ${{ matrix.triple }}-gcc -static' FC='ccache ${{ matrix.triple }}-gfortran -static' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc) diff --git a/.github/workflows/loongarch64.yml b/.github/workflows/loongarch64.yml index 4a9bf98b6..f1bf8064c 100644 --- a/.github/workflows/loongarch64.yml +++ b/.github/workflows/loongarch64.yml @@ -16,13 +16,13 @@ jobs: include: - target: LOONGSONGENERIC triple: loongarch64-unknown-linux-gnu - opts: NO_SHARED=1 TARGET=LOONGSONGENERIC + opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSONGENERIC - target: LOONGSON3R5 triple: loongarch64-unknown-linux-gnu - opts: NO_SHARED=1 TARGET=LOONGSON3R5 + opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON3R5 - target: LOONGSON2K1000 triple: loongarch64-unknown-linux-gnu - opts: NO_SHARED=1 TARGET=LOONGSON2K1000 + opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON2K1000 - target: DYNAMIC_ARCH triple: loongarch64-unknown-linux-gnu opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=GENERIC @@ -40,8 +40,9 @@ jobs: - name: Download and install loongarch64-toolchain run: | - wget https://github.com/loongson/build-tools/releases/download/2022.09.06/loongarch64-clfs-7.3-cross-tools-gcc-glibc.tar.xz - tar -xf loongarch64-clfs-7.3-cross-tools-gcc-glibc.tar.xz -C /opt + wget https://github.com/sunhaiyong1978/CLFS-for-LoongArch/releases/download/8.1/CLFS-loongarch64-8.1-x86_64-cross-tools-gcc-glibc.tar.xz + #wget https://github.com/loongson/build-tools/releases/download/2023.08.08/CLFS-loongarch64-8.1-x86_64-cross-tools-gcc-glibc.tar.xz + tar -xf CLFS-loongarch64-8.1-x86_64-cross-tools-gcc-glibc.tar.xz -C /opt - name: Set env run: | diff --git a/CMakeLists.txt b/CMakeLists.txt index 7203088a0..5a1e4b271 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,7 +8,7 @@ project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 25.dev) +set(OpenBLAS_PATCH_VERSION 26.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") @@ -256,15 +256,15 @@ if (APPLE AND BUILD_SHARED_LIBS AND CMAKE_HOST_SYSTEM_VERSION VERSION_LESS 20) set (CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1) set (CMAKE_Fortran_CREATE_SHARED_LIBRARY "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru libopenblas.a && exit 0' " - "sh -c '${CMAKE_AR} -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' " + "sh -c '${CMAKE_AR} -rs libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' " "sh 
-c 'echo \"\" | ${CMAKE_Fortran_COMPILER} -o dummy.o -c -x f95-cpp-input - '" - "sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load dummy.o -undefined dynamic_lookup -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'" + "sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load dummy.o -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'" "sh -c 'ls -l ${CMAKE_BINARY_DIR}/lib'") else () set (CMAKE_C_CREATE_SHARED_LIBRARY "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru libopenblas.a && exit 0' " - "sh -c '${CMAKE_AR} -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' " - "sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load -undefined dynamic_lookup -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'") + "sh -c '${CMAKE_AR} -rs libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' " + "sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'") endif () endif() diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 8f7abc5f8..203320826 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -220,4 +220,6 @@ In chronological order: * Mark Seminatore * [2023-11-09] Improve Windows threading performance scaling * [2024-02-09] Introduce MT_TRACE facility and improve code consistency - \ No newline at end of file + +* Dirreke + * [2024-01-16] Add basic support for the CSKY architecture diff --git a/Changelog.txt b/Changelog.txt index e0fe0ca5a..b6139d6b7 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,49 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.3.26 + 2-Jan-2024 + +general: +- improved the version of openblas.pc that is created by the CMAKE build +- fixed a CMAKE-specific build problem on older versions of MacOS +- worked around linking problems on old versions of MacOS +- corrected installation location of the lapacke_mangling header in CMAKE builds +- added type declarations for complex variables to the MSVC-specific parts of the LAPACK header +- significantly sped up ?GESV for small problem sizes by introducing a lower bound for multithreading +- imported additions and corrections from the Reference-LAPACK project: + - added new LAPACK functions for truncated QR with pivoting (Reference-LAPACK PRs 891&941) + - handle miscalculation of minimum work array size in corner cases (Reference-LAPACK PR 942) + - fixed use of uninitialized variables in ?GEDMD and improved inline documentation (PR 959) + - fixed use of uninitialized variables (and consequential failures) in ?BBCSD (PR 967) + - added tests for the recently introduced Dynamic Mode Decomposition functions (PR 736) + - fixed several memory leaks in the LAPACK testsuite (PR 953) + - fixed counting of testsuite results by the Python script (PR 954) + +x86-64: +- fixed computation of CASUM on SkylakeX and newer targets in the special + case that AVX512 is not supported by the compiler or operating environment +- fixed potential undefined behaviour in the 
CASUM/ZASUM kernels for AVX512 targets +- worked around a problem in the pre-AVX kernels for GEMV +- sped up the thread management code on MS Windows + +arm64: +- fixed building of the LAPACK testsuite with Xcode 15 on Apple M1 and newer +- sped up the thread management code on MS Windows +- sped up SGEMM and DGEMM on Neoverse V1 and N1 +- sped up ?DOT on SVE-capable targets +- reduced the number of targets in DYNAMIC_ARCH builds by eliminating functionally equivalent ones +- included support for Apple M1 and newer targets in DYNAMIC_ARCH builds + +power: +- improved the SGEMM kernel for POWER10 +- fixed compilation with (very) old versions of gcc +- fixed detection of old 32bit PPC targets in CMAKE-based builds +- added autodetection of the POWERPC 7400 subtype +- fixed CMAKE-based compilation for PPCG4 and PPC970 targets + +loongarch64: +- added and improved optimized kernels for almost all BLAS functions + ==================================================================== Version 0.3.25 12-Nov-2023 diff --git a/Makefile.arm64 b/Makefile.arm64 index 1b10446f7..ca053b03d 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -104,19 +104,25 @@ ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1 endif else -CCOMMON_OPT += -march=armv8.4-a+sve -mtune=native +CCOMMON_OPT += -march=armv8.4-a+sve +ifneq ($(CROSS), 1) +CCOMMON_OPT += -mtune=native +endif ifneq ($(F_COMPILER), NAG) -FCOMMON_OPT += -march=armv8.4-a -mtune=native +FCOMMON_OPT += -march=armv8.4-a +ifneq ($(CROSS), 1) +FCOMMON_OPT += -mtune=native +endif endif endif else -CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 +CCOMMON_OPT += -march=armv8.2-a+sve -mtune=cortex-a72 ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 endif endif else -CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 +CCOMMON_OPT += -march=armv8-a+sve -mtune=cortex-a72 ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 endif @@ -132,25 +138,31 @@ ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11) $(ISCLANG))) ifneq ($(OSNAME), Darwin) CCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2 else -CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 +CCOMMON_OPT += -march=armv8.2-a+sve -mtune=cortex-a72 endif ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2 endif else -CCOMMON_OPT += -march=armv8.5-a+sve -mtune=native +CCOMMON_OPT += -march=armv8.5-a+sve +ifneq ($(CROSS), 1) +CCOMMON_OPT += -mtune=native +endif ifneq ($(F_COMPILER), NAG) -FCOMMON_OPT += -march=armv8.5-a -mtune=native +FCOMMON_OPT += -march=armv8.5-a +ifneq ($(CROSS), 1) +FCOMMON_OPT += -mtune=native +endif endif endif else -CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 +CCOMMON_OPT += -march=armv8.2-a+sve -mtune=cortex-a72 ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 endif endif else -CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 +CCOMMON_OPT += -march=armv8-a+sve -mtune=cortex-a72 ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 endif diff --git a/Makefile.csky b/Makefile.csky new file mode 100644 index 000000000..36162af2a --- /dev/null +++ b/Makefile.csky @@ -0,0 +1,4 @@ +ifeq ($(CORE), CK860FV) +CCOMMON_OPT += -march=ck860v -mcpu=ck860fv -mfdivdu -mhard-float +FCOMMON_OPT += -march=ck860v -mcpu=ck860fv -mfdivdu -mhard-float -static +endif diff --git a/Makefile.prebuild b/Makefile.prebuild index 0be4f1274..b7d695a75 100644 --- a/Makefile.prebuild +++ b/Makefile.prebuild @@ -55,6 +55,26 @@ ifeq 
($(TARGET), C910V) TARGET_FLAGS = -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d endif +ifeq ($(TARGET), CK860FV) +TARGET_FLAGS = -march=ck860v -mcpu=ck860fv -mfdivdu -mhard-float +endif + +ifeq ($(TARGET), x280) +TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d +endif + +ifeq ($(TARGET), RISCV64_ZVL256B) +TARGET_FLAGS = -march=rv64imafdcv -mabi=lp64d +endif + +ifeq ($(TARGET), RISCV64_ZVL128B) +TARGET_FLAGS = -march=rv64imafdcv -mabi=lp64d +endif + +ifeq ($(TARGET), RISCV64_GENERIC) +TARGET_FLAGS = -march=rv64imafdc -mabi=lp64d +endif + all: getarch_2nd ./getarch_2nd 0 >> $(TARGET_MAKE) ./getarch_2nd 1 >> $(TARGET_CONF) diff --git a/Makefile.riscv64 b/Makefile.riscv64 index ce91e03ec..113cc57c5 100644 --- a/Makefile.riscv64 +++ b/Makefile.riscv64 @@ -2,3 +2,19 @@ ifeq ($(CORE), C910V) CCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 FCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 -static endif +ifeq ($(CORE), x280) +CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_zvl512b -mabi=lp64d -ffast-math +FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static +endif +ifeq ($(CORE), RISCV64_ZVL256B) +CCOMMON_OPT += -march=rv64imafdcv_zvl256b -mabi=lp64d +FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d -static +endif +ifeq ($(CORE), RISCV64_ZVL128B) +CCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d +FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d -static +endif +ifeq ($(CORE), RISCV64_GENERIC) +CCOMMON_OPT += -march=rv64imafdc -mabi=lp64d +FCOMMON_OPT += -march=rv64imafdc -mabi=lp64d -static +endif diff --git a/Makefile.rule b/Makefile.rule index 58f02358e..8dbf5eab6 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.25.dev +VERSION = 0.3.26.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. 
Meanwhile, the soname in shared library diff --git a/Makefile.system b/Makefile.system index ff06e503c..0088eaff5 100644 --- a/Makefile.system +++ b/Makefile.system @@ -677,16 +677,12 @@ ifeq ($(ARCH), arm64) DYNAMIC_CORE = ARMV8 DYNAMIC_CORE += CORTEXA53 DYNAMIC_CORE += CORTEXA57 -DYNAMIC_CORE += CORTEXA72 -DYNAMIC_CORE += CORTEXA73 DYNAMIC_CORE += NEOVERSEN1 ifneq ($(NO_SVE), 1) DYNAMIC_CORE += NEOVERSEV1 DYNAMIC_CORE += NEOVERSEN2 DYNAMIC_CORE += ARMV8SVE endif -DYNAMIC_CORE += CORTEXA55 -DYNAMIC_CORE += FALKOR DYNAMIC_CORE += THUNDERX DYNAMIC_CORE += THUNDERX2T99 DYNAMIC_CORE += TSV110 @@ -877,6 +873,11 @@ endif endif endif +ifeq ($(ARCH), csky) +NO_BINARY_MODE = 1 +BINARY_DEFINED = 1 +endif + # # C Compiler dependent settings # diff --git a/Makefile.x86_64 b/Makefile.x86_64 index 702447ace..c0dbe84c8 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -130,11 +130,11 @@ ifeq ($(C_COMPILER), GCC) endif endif else ifeq ($(C_COMPILER), CLANG) - # cooperlake support was added in clang 12 + # sapphire rapids support was added in clang 12 ifeq ($(CLANGVERSIONGTEQ12), 1) - CCOMMON_OPT += -march=cooperlake + CCOMMON_OPT += -march=sapphirerapids ifneq ($(F_COMPILER), NAG) - FCOMMON_OPT += -march=cooperlake + FCOMMON_OPT += -march=sapphirerapids endif else # not supported in clang, fallback to avx512 CCOMMON_OPT += -march=skylake-avx512 diff --git a/README.md b/README.md index aaadd0d9c..2f0a0da4c 100644 --- a/README.md +++ b/README.md @@ -196,7 +196,12 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th ```sh make HOSTCC=gcc TARGET=C910V CC=riscv64-unknown-linux-gnu-gcc FC=riscv64-unknown-linux-gnu-gfortran ``` - (also known to work on C906) + (also known to work on C906 as long as you use only single-precision functions - its instruction set support appears to be incomplete in double precision) + +- **x280**: Level-3 BLAS and Level-1,2 are optimized by RISC-V Vector extension 1.0. + ```sh + make HOSTCC=gcc TARGET=x280 NUM_THREADS=8 CC=riscv64-unknown-linux-gnu-clang FC=riscv64-unknown-linux-gnu-gfortran + ``` ### Support for multiple targets in a single library @@ -207,9 +212,11 @@ For **x86_64**, the list of targets this activates contains Prescott, Core2, Neh `DYNAMIC_ARCH` is also supported on **x86**, where it translates to Katmai, Coppermine, Northwood, Prescott, Banias, Core2, Penryn, Dunnington, Nehalem, Athlon, Opteron, Opteron_SSE3, Barcelona, Bobcat, Atom and Nano. -On **ARMV8**, it enables support for CortexA53, CortexA57, CortexA72, CortexA73, Falkor, ThunderX, ThunderX2T99, TSV110 as well as generic ARMV8 cpus. +On **ARMV8**, it enables support for CortexA53, CortexA57, CortexA72, CortexA73, Falkor, ThunderX, ThunderX2T99, TSV110 as well as generic ARMV8 cpus. If compiler support for SVE is available at build time, support for NeoverseN2, NeoverseV1 as well as generic ArmV8SVE targets is also enabled. + +For **POWER**, the list encompasses POWER6, POWER8 and POWER9. POWER10 is additionally available if a sufficiently recent compiler is used for the build. -For **POWER**, the list encompasses POWER6, POWER8 and POWER9, on **ZARCH** it comprises Z13 and Z14. +on **ZARCH** it comprises Z13 and Z14 as well as generic zarch support. The `TARGET` option can be used in conjunction with `DYNAMIC_ARCH=1` to specify which cpu model should be assumed for all the common code in the library, usually you will want to set this to the oldest model you expect to encounter. 
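As a side note to the README paragraph above, a minimal sketch of combining `DYNAMIC_ARCH=1` with a baseline `TARGET` (the target name and thread count below are illustrative assumptions, not values taken from this patch):

```sh
# Runtime-dispatched build whose common code assumes NEHALEM as the
# oldest CPU it will ever run on; pick the oldest model you expect to
# encounter in your deployment.
make DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=64
```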
diff --git a/TargetList.txt b/TargetList.txt index deef75819..115030c1b 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -118,8 +118,11 @@ Z13 Z14 10.RISC-V 64: -RISCV64_GENERIC +RISCV64_GENERIC (e.g. PolarFire Soc/SiFive U54) +RISCV64_ZVL128B C910V +x280 +RISCV64_ZVL256B 11.LOONGARCH64: LOONGSONGENERIC @@ -133,3 +136,7 @@ E2K EV4 EV5 EV6 + +14.CSKY +CSKY +CK860FV diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 317bc504a..3ae8615a7 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -288,9 +288,9 @@ jobs: vmImage: 'ubuntu-latest' steps: - script: | - wget https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.13.2/alpine-chroot-install \ - && echo '60c7e0b5d82e21d1a549fc9a46ba3b36688c09dc alpine-chroot-install' | sha1sum -c \ - || exit 1 + wget https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.14.0/alpine-chroot-install \ + && echo 'ccbf65f85cdc351851f8ad025bb3e65bae4d5b06 alpine-chroot-install' | sha1sum -c \ + || exit 1 alpine() { /alpine/enter-chroot -u "$USER" "$@"; } sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers sudo' alpine make DYNAMIC_ARCH=1 BINARY=64 diff --git a/benchmark/Makefile b/benchmark/Makefile index d9ddb9042..b7493950a 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -37,6 +37,12 @@ ESSL=/opt/ibm/lib #LIBESSL = -lesslsmp $(ESSL)/libxlomp_ser.so.1 $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a LIBESSL = -lesslsmp $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a +# x280 temporary workaround for gfortran +ifeq ($(TARGET), x280) +CCOMMON_OPT:=$(filter-out -mllvm --riscv-v-vector-bits-min=512,$(CCOMMON_OPT)) +endif + + ifneq ($(NO_LAPACK), 1) GOTO_LAPACK_TARGETS=slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ scholesky.goto dcholesky.goto ccholesky.goto zcholesky.goto \ @@ -3436,4 +3442,4 @@ smallscaling: smallscaling.c ../$(LIBNAME) clean :: @rm -f *.goto *.mkl *.acml *.atlas *.veclib *.essl smallscaling -include $(TOPDIR)/Makefile.tail +include $(TOPDIR)/Makefile.tail \ No newline at end of file diff --git a/c_check b/c_check index b5e4a9ad0..59ab9bb13 100755 --- a/c_check +++ b/c_check @@ -91,6 +91,7 @@ case "$data" in *ARCH_ZARCH*) architecture=zarch ;; *ARCH_RISCV64*) architecture=riscv64 ;; *ARCH_LOONGARCH64*) architecture=loongarch64 ;; + *ARCH_CSKY*) architecture=csky ;; esac defined=0 @@ -236,6 +237,7 @@ case "$data" in *ARCH_ARM*) architecture=arm ;; *ARCH_ZARCH*) architecture=zarch ;; *ARCH_LOONGARCH64*) architecture=loongarch64 ;; + *ARCH_CSKY*) architecture=csky ;; esac binformat='bin32' @@ -244,6 +246,7 @@ case "$data" in esac no_avx512=0 +no_avx512bf=0 if [ "$architecture" = "x86" ] || [ "$architecture" = "x86_64" ]; then tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC') tmpf="$tmpd/a.c" @@ -262,6 +265,25 @@ if [ "$architecture" = "x86" ] || [ "$architecture" = "x86_64" ]; then } rm -rf "$tmpd" + if [ "$no_avx512" -eq 0 ]; then + tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC') + tmpf="$tmpd/a.c" + code='"__m512 a= _mm512_dpbf16_ps(a, (__m512bh) _mm512_loadu_si512(%1]), (__m512bh) _mm512_loadu_si512(%2]));"' + printf "#include \n\nint main(void){ %s; }\n" "$code" >> "$tmpf" + if [ "$compiler" = "PGI" ]; then + args=" -tp cooperlake -c -o $tmpf.o $tmpf" + else + args=" -march=cooperlake -c -o $tmpf.o $tmpf" + fi + no_avx512bf=0 + { + $compiler_name $flags $args >/dev/null 2>&1 + } || { + no_avx512bf=1 + } + + rm -rf "$tmpd" + fi fi 
no_rv64gv=0 @@ -409,6 +431,7 @@ done [ "$makefile" = "-" ] && { [ "$no_rv64gv" -eq 1 ] && printf "NO_RV64GV=1\n" [ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n" + [ "$no_avx512bf" -eq 1 ] && printf "NO_AVX512BF16=1\n" [ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n" [ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n" exit 0 @@ -437,6 +460,7 @@ done [ "$no_sve" -eq 1 ] && printf "NO_SVE=1\n" [ "$no_rv64gv" -eq 1 ] && printf "NO_RV64GV=1\n" [ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n" + [ "$no_avx512bf" -eq 1 ] && printf "NO_AVX512BF16=1\n" [ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n" [ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n" [ "$no_lsx" -eq 1 ] && printf "NO_LSX=1\n" diff --git a/c_check.pl b/c_check.pl index d9c36793c..6b89f06eb 100644 --- a/c_check.pl +++ b/c_check.pl @@ -97,6 +97,7 @@ $architecture = arm64 if ($data =~ /ARCH_ARM64/); $architecture = zarch if ($data =~ /ARCH_ZARCH/); $architecture = riscv64 if ($data =~ /ARCH_RISCV64/); $architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/); +$architecture = csky if ($data =~ /ARCH_CSKY/); $defined = 0; @@ -156,6 +157,11 @@ if ($architecture eq "loongarch64") { $binary = 64; } +if ($architecture eq "csky") { + $defined = 1; + $binary = 32; +} + if ($compiler eq "PGI") { $compiler_name .= " -tp p7" if ($binary eq "32"); $compiler_name .= " -tp p7-64" if ($binary eq "64"); @@ -284,6 +290,7 @@ $architecture = arm if ($data =~ /ARCH_ARM/); $architecture = arm64 if ($data =~ /ARCH_ARM64/); $architecture = zarch if ($data =~ /ARCH_ZARCH/); $architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/); +$architecture = csky if ($data =~ /ARCH_CSKY/); $binformat = bin32; $binformat = bin64 if ($data =~ /BINARY_64/); diff --git a/cblas.h b/cblas.h index 8a5055cf8..beaa32cc2 100644 --- a/cblas.h +++ b/cblas.h @@ -12,6 +12,7 @@ extern "C" { /*Set the number of threads on runtime.*/ void openblas_set_num_threads(int num_threads); void goto_set_num_threads(int num_threads); +int openblas_set_num_threads_local(int num_threads); /*Get the number of threads on runtime.*/ int openblas_get_num_threads(void); @@ -100,6 +101,16 @@ CBLAS_INDEX cblas_idamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPE CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); +float cblas_samax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); +double cblas_damax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); +float cblas_scamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); +double cblas_dzamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); + +float cblas_samin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); +double cblas_damin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); +float cblas_scamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); +double cblas_dzamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); + CBLAS_INDEX cblas_ismax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); CBLAS_INDEX cblas_idmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); CBLAS_INDEX cblas_icmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); @@ -115,6 +126,9 @@ void 
cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); void cblas_zaxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); +void cblas_caxpyc(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); +void cblas_zaxpyc(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); + void cblas_scopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy); void cblas_dcopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy); void cblas_ccopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); @@ -289,6 +303,14 @@ void cblas_zgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLA void cblas_zgemm3m(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc); +void cblas_sgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K, + OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc); +void cblas_dgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K, + OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double beta, double *C, OPENBLAS_CONST blasint ldc); +void cblas_cgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K, + OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc); +void cblas_zgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K, + OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc); void cblas_ssymm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, 
OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc); diff --git a/cmake/arch.cmake b/cmake/arch.cmake index ebdc5a833..eb974456b 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -44,7 +44,7 @@ endif () if (DYNAMIC_ARCH) if (ARM64) - set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) + set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 9.99) set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE) endif () diff --git a/cmake/cc.cmake b/cmake/cc.cmake index 7b4ef8947..2da941afb 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -36,9 +36,19 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LS if (LOONGARCH64) if (BINARY64) - set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp64") + CHECK_CXX_COMPILER_FLAG("-mabi=lp64d" COMPILER_SUPPORT_LP64D_ABI) + if(COMPILER_SUPPORT_LP64D_ABI) + set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp64d") + else() + set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp64") + endif () else () - set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp32") + CHECK_CXX_COMPILER_FLAG("-mabi=ilp32d" COMPILER_SUPPORT_ILP32D_ABI) + if(COMPILER_SUPPORT_ILP32D_ABI) + set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=ilp32d") + else() + set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp32") + endif () endif () set(BINARY_DEFINED 1) endif () @@ -282,6 +292,27 @@ if (${CORE} STREQUAL POWER8) endif () endif () +# With -mcpu=970 added it compiles, but library is broken, at least on macOS. If someone +# tests on *BSD or Linux and adds this flag, please make sure it is not used for macOS case. +if (${CORE} STREQUAL PPC970) + if (NOT DYNAMIC_ARCH) + set (CCOMMON_OPT "${CCOMMON_OPT} -mtune=970 -maltivec -fno-fast-math") + endif () + if (APPLE) + set (CCOMMON_OPT "${CCOMMON_OPT} -force_cpusubtype_ALL") + endif () +endif () + +# -mcpu=G4 seems to work fine, but perhaps avoid it for the sake of consistency? 
+if (${CORE} STREQUAL PPCG4) + if (NOT DYNAMIC_ARCH) + set (CCOMMON_OPT "${CCOMMON_OPT} -mtune=G4 -maltivec -fno-fast-math") + endif () + if (APPLE) + set (CCOMMON_OPT "${CCOMMON_OPT} -force_cpusubtype_ALL") + endif () +endif () + if (NOT DYNAMIC_ARCH) if (HAVE_AVX2) set (CCOMMON_OPT "${CCOMMON_OPT} -mavx2") diff --git a/cmake/fc.cmake b/cmake/fc.cmake index c496f6368..5c30be843 100644 --- a/cmake/fc.cmake +++ b/cmake/fc.cmake @@ -61,9 +61,19 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_F endif () if (LOONGARCH64) if (BINARY64) - set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64") + CHECK_CXX_COMPILER_FLAG("-mabi=lp64d" COMPILER_SUPPORT_LP64D_ABI) + if(COMPILER_SUPPORT_LP64D_ABI) + set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64d") + else() + set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64") + endif () else () - set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp32") + CHECK_CXX_COMPILER_FLAG("-mabi=ilp32d" COMPILER_SUPPORT_ILP32D_ABI) + if(COMPILER_SUPPORT_ILP32D_ABI) + set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=ilp32d") + else() + set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp32") + endif () endif () endif () if (RISCV64) diff --git a/cmake/openblas.pc.in b/cmake/openblas.pc.in index 7e120af86..11e5606e5 100644 --- a/cmake/openblas.pc.in +++ b/cmake/openblas.pc.in @@ -5,7 +5,7 @@ includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@ openblas_config=USE_64BITINT=@INTERFACE64@ NO_CBLAS=@NO_CBLAS@ NO_LAPACK=@NO_LAPACK@ NO_LAPACKE=@NO_LAPACKE@ DYNAMIC_ARCH=@DYNAMIC_ARCH@ DYNAMIC_OLDER=@DYNAMIC_OLDER@ NO_AFFINITY=@NO_AFFINITY@ USE_OPENMP=@USE_OPENMP@ @CORE@ MAX_THREADS=@NUM_THREADS@ Name: OpenBLAS Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version -Version: @OPENBLAS_VERSION@ -URL: https://github.com/xianyi/OpenBLAS +Version: @OpenBLAS_VERSION@ +URL: https://github.com/OpenMathLib/OpenBLAS Libs: @OpenMP_C_FLAGS@ -L${libdir} -lopenblas${libsuffix} Cflags: -I${includedir} diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index c9671b379..e94497a04 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -38,7 +38,7 @@ if(CMAKE_CL_64 OR MINGW64) endif() elseif(MINGW OR (MSVC AND NOT CMAKE_CROSSCOMPILING)) set(X86 1) -elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*") +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*" OR (CMAKE_SYSTEM_NAME MATCHES "Darwin" AND CMAKE_OSX_ARCHITECTURES MATCHES "ppc.*")) set(POWER 1) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*") set(MIPS64 1) @@ -109,7 +109,7 @@ else() endif () if (NOT BINARY) - if (X86_64 OR ARM64 OR POWER OR MIPS64 OR LOONGARCH64 OR RISCV64) + if (X86_64 OR ARM64 OR MIPS64 OR LOONGARCH64 OR RISCV64 OR (POWER AND NOT (CMAKE_OSX_ARCHITECTURES STREQUAL "ppc"))) set(BINARY 64) else () set(BINARY 32) diff --git a/common.h b/common.h index 4074df069..1f0b9e533 100644 --- a/common.h +++ b/common.h @@ -396,7 +396,7 @@ typedef int blasint; #endif /*** -To alloc job_t on heap or statck. +To alloc job_t on heap or stack. 
please https://github.com/xianyi/OpenBLAS/issues/246 ***/ #if defined(OS_WINDOWS) @@ -482,6 +482,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246 #include "common_e2k.h" #endif +#ifdef ARCH_CSKY +#include "common_csky.h" +#endif + #ifndef ASSEMBLER #ifdef OS_WINDOWSSTORE typedef char env_var_t[MAX_PATH]; diff --git a/common_csky.h b/common_csky.h new file mode 100644 index 000000000..3095dc781 --- /dev/null +++ b/common_csky.h @@ -0,0 +1,56 @@ +/***************************************************************************** +Copyright (c) 2011-2015, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************************/ + +#ifndef COMMON_CSKY +#define COMMON_CSKY + +#define MB __sync_synchronize() +#define WMB __sync_synchronize() +#define RMB __sync_synchronize() + +#define INLINE inline + +#ifndef ASSEMBLER + + +static inline int blas_quickdivide(blasint x, blasint y){ + return x / y; +} + +#endif + + + +#define BUFFER_SIZE ( 32 << 20) +#define SEEK_ADDRESS + +#endif \ No newline at end of file diff --git a/common_interface.h b/common_interface.h index 318827920..5a2e1654c 100644 --- a/common_interface.h +++ b/common_interface.h @@ -498,6 +498,15 @@ void BLASFUNC(zgemm3m)(char *, char *, blasint *, blasint *, blasint *, double * void BLASFUNC(xgemm3m)(char *, char *, blasint *, blasint *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); +void BLASFUNC(sgemmt)(char*, char *, char *, blasint *, blasint *, float *, + float *, blasint *, float *, blasint *, float *, float *, blasint *); +void BLASFUNC(dgemmt)(char*, char *, char *, blasint *, blasint *, double *, + double *, blasint *, double *, blasint *, double *, double *, blasint *); +void BLASFUNC(cgemmt)(char*, char *, char *, blasint *, blasint *, float *, + float *, blasint *, float *, blasint *, float *, float *, blasint *); +void BLASFUNC(zgemmt)(char*, char *, char *, blasint *, blasint *, double *, + double *, blasint *, double *, blasint *, double *, double *, blasint *); + int BLASFUNC(sge2mm)(char *, char *, char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *, float *, float *, blasint *); @@ -764,8 +773,8 @@ xdouble BLASFUNC(qlamc3)(xdouble *, xdouble *); void BLASFUNC(saxpby) (blasint *, float *, float *, blasint *, float *, float *, blasint *); void BLASFUNC(daxpby) (blasint *, double *, double *, blasint *, double *, double *, blasint *); -void BLASFUNC(caxpby) (blasint *, float *, float *, blasint *, float *, float *, blasint *); -void BLASFUNC(zaxpby) (blasint *, double *, double *, blasint *, double *, double *, blasint *); +void BLASFUNC(caxpby) (blasint *, void *, float *, blasint *, void *, float *, blasint *); +void BLASFUNC(zaxpby) (blasint *, void *, double *, blasint *, void *, double *, blasint *); void BLASFUNC(somatcopy) (char *, char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *); void BLASFUNC(domatcopy) (char *, char *, blasint *, blasint *, double *, double *, blasint *, double *, blasint *); diff --git a/common_loongarch64.h b/common_loongarch64.h index 4963b2f07..b1426da79 100644 --- a/common_loongarch64.h +++ b/common_loongarch64.h @@ -119,19 +119,47 @@ static inline int WhereAmI(void){ #define MOV fmov.d #define CMOVT fsel #define MTC movgr2fr.d +#define MTG movfr2gr.d #define FABS fabs.d +#define FMIN fmin.d +#define FMINA fmina.d +#define FMAX fmax.d +#define FMAXA fmaxa.d #define CMPEQ fcmp.ceq.d #define CMPLE fcmp.cle.d #define CMPLT fcmp.clt.d #define NEG fneg.d +#define FFINT ffint.d.l #define XVFSUB xvfsub.d #define XVFADD xvfadd.d +#define XVFMUL xvfmul.d #define XVFMADD xvfmadd.d +#define XVFMIN xvfmin.d +#define XVFMINA xvfmina.d +#define XVFMAX xvfmax.d +#define XVFMAXA xvfmaxa.d +#define XVCMPEQ xvfcmp.ceq.d +#define XVCMPLE xvfcmp.cle.d +#define XVCMPLT xvfcmp.clt.d +#define XVMUL xvfmul.d +#define XVMSUB xvfmsub.d +#define XVNMSUB xvfnmsub.d #define VFSUB vfsub.d #define VFADD vfadd.d +#define VFMUL vfmul.d #define VFMADD vfmadd.d +#define VFMIN vfmin.d +#define VFMINA vfmina.d +#define VFMAX vfmax.d +#define VFMAXA 
vfmaxa.d +#define VCMPEQ vfcmp.ceq.d +#define VCMPLE vfcmp.cle.d +#define VCMPLT vfcmp.clt.d +#define VMUL vfmul.d +#define VMSUB vfmsub.d +#define VNMSUB vfnmsub.d #else @@ -147,19 +175,47 @@ static inline int WhereAmI(void){ #define MOV fmov.s #define CMOVT fsel #define MTC movgr2fr.w +#define MTG movfr2gr.s #define FABS fabs.s +#define FMIN fmin.s +#define FMINA fmina.s +#define FMAX fmax.s +#define FMAXA fmaxa.s #define CMPEQ fcmp.ceq.s #define CMPLE fcmp.cle.s #define CMPLT fcmp.clt.s #define NEG fneg.s +#define FFINT ffint.s.l #define XVFSUB xvfsub.s #define XVFADD xvfadd.s +#define XVFMUL xvfmul.s #define XVFMADD xvfmadd.s +#define XVFMIN xvfmin.s +#define XVFMINA xvfmina.s +#define XVFMAX xvfmax.s +#define XVFMAXA xvfmaxa.s +#define XVCMPEQ xvfcmp.ceq.s +#define XVCMPLE xvfcmp.cle.s +#define XVCMPLT xvfcmp.clt.s +#define XVMUL xvfmul.s +#define XVMSUB xvfmsub.s +#define XVNMSUB xvfnmsub.s #define VFSUB vfsub.s #define VFADD vfadd.s +#define VFMUL vfmul.s #define VFMADD vfmadd.s +#define VFMIN vfmin.s +#define VFMINA vfmina.s +#define VFMAX vfmax.s +#define VFMAXA vfmaxa.s +#define VCMPEQ vfcmp.ceq.s +#define VCMPLE vfcmp.cle.s +#define VCMPLT vfcmp.clt.s +#define VMUL vfmul.s +#define VMSUB vfmsub.s +#define VNMSUB vfnmsub.s #endif /* defined(DOUBLE) */ diff --git a/common_riscv64.h b/common_riscv64.h index 7ddbe80a4..ab3bfa25a 100644 --- a/common_riscv64.h +++ b/common_riscv64.h @@ -91,8 +91,26 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define BUFFER_SIZE ( 32 << 20) #define SEEK_ADDRESS -#if defined(C910V) -#include +#if defined(C910V) || (defined(RISCV64_ZVL256B) && (defined(__clang__) || defined(RVV_COMPATIBLE_GCC))) || defined(RISCV64_ZVL128B) || defined(x280) +# include +#endif + +#if defined( __riscv_xtheadc ) && defined( __riscv_v ) && ( __riscv_v <= 7000 ) +// t-head toolchain uses obsolete rvv intrinsics, can't build for C910V without this +#define RISCV_0p10_INTRINSICS +#define RISCV_RVV(x) x +#else +#define RISCV_RVV(x) __riscv_ ## x +#endif + +#if defined(C910V) || defined(RISCV64_ZVL256B) +# if !defined(DOUBLE) +# define EXTRACT_FLOAT(v) RISCV_RVV(vfmv_f_s_f32m1_f32)(v) +# else +# define EXTRACT_FLOAT(v) RISCV_RVV(vfmv_f_s_f64m1_f64)(v) +# endif +#else +# define EXTRACT_FLOAT(v) (v[0]) #endif #endif diff --git a/common_thread.h b/common_thread.h index 9e7dae74a..d37fcb189 100644 --- a/common_thread.h +++ b/common_thread.h @@ -137,19 +137,20 @@ typedef struct blas_queue { extern int blas_server_avail; extern int blas_omp_number_max; +extern int blas_omp_threads_local; static __inline int num_cpu_avail(int level) { #ifdef USE_OPENMP int openmp_nthreads; openmp_nthreads=omp_get_max_threads(); + if (omp_in_parallel()) openmp_nthreads = blas_omp_threads_local; #endif #ifndef USE_OPENMP if (blas_cpu_number == 1 -#endif -#ifdef USE_OPENMP - if (openmp_nthreads == 1 || omp_in_parallel() +#else + if (openmp_nthreads == 1 #endif ) return 1; diff --git a/cpuid_power.c b/cpuid_power.c index 2526e8d0e..1ced8930a 100644 --- a/cpuid_power.c +++ b/cpuid_power.c @@ -160,6 +160,7 @@ int detect(void){ infoCount = HOST_BASIC_INFO_COUNT; host_info(mach_host_self(), HOST_BASIC_INFO, (host_info_t)&hostInfo, &infoCount); + if (hostInfo.cpu_subtype == CPU_SUBTYPE_POWERPC_7400) return CPUTYPE_PPCG4; if (hostInfo.cpu_subtype == CPU_SUBTYPE_POWERPC_7450) return CPUTYPE_PPCG4; if (hostInfo.cpu_subtype == CPU_SUBTYPE_POWERPC_970) return CPUTYPE_PPC970; diff --git a/cpuid_riscv64.c b/cpuid_riscv64.c index 894d2b873..ff7ba2aad 100644 --- a/cpuid_riscv64.c +++ b/cpuid_riscv64.c @@ 
-70,12 +70,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ -#define CPU_GENERIC 0 -#define CPU_C910V 1 +#define CPU_GENERIC 0 +#define CPU_C910V 1 +#define CPU_x280 2 +#define CPU_RISCV64_ZVL256B 3 +#define CPU_RISCV64_ZVL128B 4 static char *cpuname[] = { "RISCV64_GENERIC", - "C910V" + "C910V", + "x280", + "CPU_RISCV64_ZVL256B", + "CPU_RISCV64_ZVL128B" +}; + +static char *cpuname_lower[] = { + "riscv64_generic", + "c910v", + "x280", + "riscv64_zvl256b", + "riscv64_zvl128b" }; int detect(void){ @@ -86,23 +100,29 @@ int detect(void){ char *pmodel = NULL, *pisa = NULL; infile = fopen("/proc/cpuinfo", "r"); + if (!infile) + return CPU_GENERIC; while (fgets(buffer, sizeof(buffer), infile)){ if(!strncmp(buffer, "model name", 10)){ strcpy(model_buffer, buffer); - pmodel = strchr(isa_buffer, ':') + 1; + pmodel = strchr(model_buffer, ':'); + if (pmodel) + pmodel++; } if(!strncmp(buffer, "isa", 3)){ strcpy(isa_buffer, buffer); - pisa = strchr(isa_buffer, '4') + 1; + pisa = strchr(isa_buffer, '4'); + if (pisa) + pisa++; } } fclose(infile); - if (!pmodel) + if (!pmodel || !pisa) return(CPU_GENERIC); - + if (strstr(pmodel, check_c910_str) && strchr(pisa, 'v')) return CPU_C910V; @@ -140,5 +160,5 @@ void get_cpuconfig(void){ } void get_libname(void){ - printf("riscv64\n"); + printf("%s", cpuname_lower[detect()]); } diff --git a/ctest.c b/ctest.c index 2ccae8dcc..cbc15326f 100644 --- a/ctest.c +++ b/ctest.c @@ -173,6 +173,10 @@ HAVE_C11 ARCH_E2K #endif +#if defined(__csky__) +ARCH_CSKY +#endif + #if defined(__EMSCRIPTEN__) ARCH_RISCV64 OS_WINDOWS diff --git a/ctest/Makefile b/ctest/Makefile index af5b34a36..ad960b35a 100644 --- a/ctest/Makefile +++ b/ctest/Makefile @@ -218,6 +218,9 @@ ifeq ($(F_COMPILER), IBM) ifeq ($(C_COMPILER), GCC) CEXTRALIB += -lgomp endif +ifeq ($(C_COMPILER), CLANG) +CEXTRALIB += -lomp +endif endif endif diff --git a/ctest/c_cblat1.f b/ctest/c_cblat1.f index 1a123d74d..cad7c7fa7 100644 --- a/ctest/c_cblat1.f +++ b/ctest/c_cblat1.f @@ -96,7 +96,7 @@ INTEGER ICAMAXTEST EXTERNAL SCASUMTEST, SCNRM2TEST, ICAMAXTEST * .. External Subroutines .. - EXTERNAL CSCAL, CSSCALTEST, CTEST, ITEST1, STEST1 + EXTERNAL CSCALTEST, CSSCALTEST, CTEST, ITEST1, STEST1 * .. Intrinsic Functions .. INTRINSIC MAX * .. Common blocks .. @@ -214,8 +214,8 @@ CALL STEST1(SCASUMTEST(N,CX,INCX),STRUE4(NP1), + STRUE4(NP1),SFAC) ELSE IF (ICASE.EQ.8) THEN -* .. CSCAL .. - CALL CSCAL(N,CA,CX,INCX) +* .. CSCALTEST .. + CALL CSCALTEST(N,CA,CX,INCX) CALL CTEST(LEN,CX,CTRUE5(1,NP1,INCX),CTRUE5(1,NP1,INCX), + SFAC) ELSE IF (ICASE.EQ.9) THEN @@ -236,14 +236,14 @@ * INCX = 1 IF (ICASE.EQ.8) THEN -* CSCAL +* CSCALTEST * Add a test for alpha equal to zero. 
CA = (0.0E0,0.0E0) DO 80 I = 1, 5 MWPCT(I) = (0.0E0,0.0E0) MWPCS(I) = (1.0E0,1.0E0) 80 CONTINUE - CALL CSCAL(5,CA,CX,INCX) + CALL CSCALTEST(5,CA,CX,INCX) CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) ELSE IF (ICASE.EQ.9) THEN * CSSCALTEST diff --git a/ctest/c_cblat1c.c b/ctest/c_cblat1c.c index b4c512436..d9a539097 100644 --- a/ctest/c_cblat1c.c +++ b/ctest/c_cblat1c.c @@ -440,6 +440,7 @@ static real c_b43 = (float)1.; extern /* Subroutine */ int ctest_(integer*, complex*, complex*, complex*, real*); static complex mwpcs[5], mwpct[5]; extern /* Subroutine */ int itest1_(integer*, integer*), stest1_(real*,real*,real*,real*); + extern /* Subroutine */ int cscaltest_(), itest1_(), stest1_(); static complex cx[8]; extern real scnrm2test_(integer*, complex*, integer*); static integer np1; @@ -481,7 +482,7 @@ static real c_b43 = (float)1.; stest1_(&r__1, &strue4[np1 - 1], &strue4[np1 - 1], sfac); } else if (combla_1.icase == 8) { /* .. CSCAL .. */ - cscal_(&combla_1.n, &ca, cx, &combla_1.incx); + cscaltest_(&combla_1.n, &ca, cx, &combla_1.incx); ctest_(&len, cx, &ctrue5[(np1 + combla_1.incx * 5 << 3) - 48], &ctrue5[(np1 + combla_1.incx * 5 << 3) - 48], sfac); } else if (combla_1.icase == 9) { @@ -515,7 +516,7 @@ static real c_b43 = (float)1.; mwpcs[i__1].r = (float)1., mwpcs[i__1].i = (float)1.; /* L80: */ } - cscal_(&c__5, &ca, cx, &combla_1.incx); + cscaltest_(&c__5, &ca, cx, &combla_1.incx); ctest_(&c__5, cx, mwpct, mwpcs, sfac); } else if (combla_1.icase == 9) { /* CSSCALTEST */ diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index 2fcb37192..2531c57e9 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -113,6 +113,8 @@ extern unsigned int openblas_thread_timeout(void); /* We need this global for checking if initialization is finished. */ int blas_server_avail __attribute__((aligned(ATTRIBUTE_SIZE))) = 0; +int blas_omp_threads_local = 1; + /* Local Variables */ #if defined(USE_PTHREAD_LOCK) static pthread_mutex_t server_lock = PTHREAD_MUTEX_INITIALIZER; diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index 213531057..4d3d1e0de 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -69,6 +69,7 @@ int blas_server_avail = 0; int blas_omp_number_max = 0; +int blas_omp_threads_local = 1; extern int openblas_omp_adaptive_env(void); diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index 89ce9e656..2ad8b8c5f 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -65,6 +65,8 @@ static CRITICAL_SECTION queue_lock; /* We need this global for checking if initialization is finished. 
*/ int blas_server_avail = 0; +int blas_omp_threads_local = 1; + /* Local Variables */ static BLASULONG server_lock = 0; diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 69a473060..e3f905265 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -275,6 +275,7 @@ extern gotoblas_t gotoblas_EXCAVATOR; #define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE #define gotoblas_COOPERLAKE gotoblas_SANDYBRIDGE #define gotoblas_ZEN gotoblas_SANDYBRIDGE +#define gotoblas_SAPPHIRERAPIDS gotoblas_SANDYBRIDGE #else extern gotoblas_t gotoblas_HASWELL; extern gotoblas_t gotoblas_ZEN; diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index 530d18115..6b21028d1 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -1,6 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ -/* Copyright 2023 The OpenBLAS Project */ +/* Copyright 2023-2024 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -122,10 +122,11 @@ extern gotoblas_t gotoblas_CORTEXA55; #endif #else extern gotoblas_t gotoblas_CORTEXA53; +#define gotoblas_CORTEXA55 gotoblas_CORTEXA53 extern gotoblas_t gotoblas_CORTEXA57; -extern gotoblas_t gotoblas_CORTEXA72; -extern gotoblas_t gotoblas_CORTEXA73; -extern gotoblas_t gotoblas_FALKOR; +#define gotoblas_CORTEXA72 gotoblas_CORTEXA57 +#define gotoblas_CORTEXA73 gotoblas_CORTEXA57 +#define gotoblas_FALKOR gotoblas_CORTEXA57 extern gotoblas_t gotoblas_THUNDERX; extern gotoblas_t gotoblas_THUNDERX2T99; extern gotoblas_t gotoblas_TSV110; @@ -141,14 +142,14 @@ extern gotoblas_t gotoblas_ARMV8SVE; #define gotoblas_ARMV8SVE gotoblas_ARMV8 #endif extern gotoblas_t gotoblas_THUNDERX3T110; -extern gotoblas_t gotoblas_CORTEXA55; #endif +#define gotoblas_NEOVERSEV2 gotoblas_NEOVERSEV1 extern void openblas_warning(int verbose, const char * msg); #define FALLBACK_VERBOSE 1 #define NEOVERSEN1_FALLBACK "OpenBLAS : Your OS does not support SVE instructions. 
OpenBLAS is using Neoverse N1 kernels as a fallback, which may give poorer performance.\n" -#define NUM_CORETYPES 16 +#define NUM_CORETYPES 17 /* * In case asm/hwcap.h is outdated on the build system, make sure @@ -178,6 +179,7 @@ static char *corename[] = { "emag8180", "neoversen1", "neoversev1", + "neoversev2", "neoversen2", "thunderx3t110", "cortexa55", @@ -198,10 +200,11 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_EMAG8180) return corename[ 9]; if (gotoblas == &gotoblas_NEOVERSEN1) return corename[10]; if (gotoblas == &gotoblas_NEOVERSEV1) return corename[11]; - if (gotoblas == &gotoblas_NEOVERSEN2) return corename[12]; - if (gotoblas == &gotoblas_THUNDERX3T110) return corename[13]; - if (gotoblas == &gotoblas_CORTEXA55) return corename[14]; - if (gotoblas == &gotoblas_ARMV8SVE) return corename[15]; + if (gotoblas == &gotoblas_NEOVERSEV2) return corename[12]; + if (gotoblas == &gotoblas_NEOVERSEN2) return corename[13]; + if (gotoblas == &gotoblas_THUNDERX3T110) return corename[14]; + if (gotoblas == &gotoblas_CORTEXA55) return corename[15]; + if (gotoblas == &gotoblas_ARMV8SVE) return corename[16]; return corename[NUM_CORETYPES]; } @@ -233,10 +236,11 @@ static gotoblas_t *force_coretype(char *coretype) { case 9: return (&gotoblas_EMAG8180); case 10: return (&gotoblas_NEOVERSEN1); case 11: return (&gotoblas_NEOVERSEV1); - case 12: return (&gotoblas_NEOVERSEN2); - case 13: return (&gotoblas_THUNDERX3T110); - case 14: return (&gotoblas_CORTEXA55); - case 15: return (&gotoblas_ARMV8SVE); + case 12: return (&gotoblas_NEOVERSEV2); + case 13: return (&gotoblas_NEOVERSEN2); + case 14: return (&gotoblas_THUNDERX3T110); + case 15: return (&gotoblas_CORTEXA55); + case 16: return (&gotoblas_ARMV8SVE); } snprintf(message, 128, "Core not found: %s\n", coretype); openblas_warning(1, message); @@ -247,6 +251,10 @@ static gotoblas_t *get_coretype(void) { int implementer, variant, part, arch, revision, midr_el1; char coremsg[128]; +#if defined (OS_DARWIN) + return &gotoblas_NEOVERSEN1; +#endif + #if (!defined OS_LINUX && !defined OS_ANDROID) return NULL; #else @@ -308,6 +316,13 @@ static gotoblas_t *get_coretype(void) { return &gotoblas_NEOVERSEN1; }else return &gotoblas_NEOVERSEV1; + case 0xd4f: + if (!(getauxval(AT_HWCAP) & HWCAP_SVE)) { + openblas_warning(FALLBACK_VERBOSE, NEOVERSEN1_FALLBACK); + return &gotoblas_NEOVERSEN1; + } else { + return &gotoblas_NEOVERSEV2; + } #endif case 0xd05: // Cortex A55 return &gotoblas_CORTEXA55; @@ -352,6 +367,9 @@ static gotoblas_t *get_coretype(void) { return &gotoblas_FALKOR; } break; + case 0x61: // Apple + return &gotoblas_NEOVERSEN1; + break; default: snprintf(coremsg, 128, "Unknown CPU model - implementer %x part %x\n",implementer,part); openblas_warning(1, coremsg); diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index 0454f186c..4c1f4a26e 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -43,6 +43,13 @@ char *gotoblas_corename(void) { #define CPU_POWER9 9 #define CPU_POWER10 10 +#ifndef POWER_9 +#define POWER_9 0x20000 /* 9 class CPU */ +#endif +#ifndef POWER_10 +#define POWER_10 0x40000 /* 10 class CPU */ +#endif + #ifdef _AIX #include @@ -62,7 +69,7 @@ static int cpuid(void) else if (arch == POWER_9) return CPU_POWER9; #endif #ifdef POWER_10 - else if (arch == POWER_10) return CPU_POWER10; + else if (arch >= POWER_10) return CPU_POWER10; #endif return CPU_UNKNOWN; } @@ -332,6 +339,9 @@ void gotoblas_dynamic_init(void) { if (gotoblas && gotoblas -> init) { 
strncpy(coren,gotoblas_corename(),20); sprintf(coremsg, "Core: %s\n",coren); + if (getenv("GET_OPENBLAS_CORETYPE")) { + fprintf(stderr, "%s", coremsg); + } openblas_warning(2, coremsg); gotoblas -> init(); } else { diff --git a/driver/others/memory.c b/driver/others/memory.c index caef3e2b7..4ee8f9a2e 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -3214,7 +3214,7 @@ void blas_shutdown(void){ #endif memory[pos].lock = 0; } - if (memory_overflowed) + if (memory_overflowed) { for (pos = 0; pos < NEW_BUFFERS; pos ++){ newmemory[pos].addr = (void *)0; newmemory[pos].used = 0; @@ -3222,6 +3222,10 @@ void blas_shutdown(void){ newmemory[pos].pos = -1; #endif newmemory[pos].lock = 0; + } + free(newmemory); + newmemory = NULL; + memory_overflowed = 0; } UNLOCK_COMMAND(&alloc_lock); diff --git a/driver/others/openblas_set_num_threads.c b/driver/others/openblas_set_num_threads.c index 0b57867b0..c938fde69 100644 --- a/driver/others/openblas_set_num_threads.c +++ b/driver/others/openblas_set_num_threads.c @@ -36,11 +36,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifdef SMP_SERVER extern void openblas_set_num_threads(int num_threads) ; +extern int openblas_get_num_threads(void) ; void openblas_set_num_threads_(int* num_threads){ openblas_set_num_threads(*num_threads); } +int openblas_set_num_threads_local(int num_threads){ + int ret = openblas_get_num_threads(); + openblas_set_num_threads(num_threads); + blas_omp_threads_local=num_threads; + return ret; +} + + #else //Single thread @@ -50,4 +59,8 @@ void openblas_set_num_threads(int num_threads) { void openblas_set_num_threads_(int* num_threads){ } + +int openblas_set_num_threads_local(int num_threads){ + return 1; +} #endif diff --git a/exports/gensymbol b/exports/gensymbol index 704eab06f..226035842 100755 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -60,6 +60,7 @@ cblasobjsc=" cblas_ctbsv cblas_ctpmv cblas_ctpsv cblas_ctrmm cblas_ctrmv cblas_ctrsm cblas_ctrsv cblas_scnrm2 cblas_scasum cblas_cgemmt cblas_icamax cblas_icamin cblas_icmin cblas_icmax cblas_scsum cblas_cimatcopy cblas_comatcopy + cblas_caxpyc cblas_crotg cblas_csrot cblas_scamax cblas_scamin " cblasobjsd=" cblas_dasum cblas_daxpy cblas_dcopy cblas_ddot @@ -69,6 +70,7 @@ cblasobjsd=" cblas_dsyr2k cblas_dsyr cblas_dsyrk cblas_dtbmv cblas_dtbsv cblas_dtpmv cblas_dtpsv cblas_dtrmm cblas_dtrmv cblas_dtrsm cblas_dtrsv cblas_daxpby cblas_dgeadd cblas_dgemmt cblas_idamax cblas_idamin cblas_idmin cblas_idmax cblas_dsum cblas_dimatcopy cblas_domatcopy + cblas_damax cblas_damin " cblasobjss=" @@ -80,6 +82,7 @@ cblasobjss=" cblas_stbmv cblas_stbsv cblas_stpmv cblas_stpsv cblas_strmm cblas_strmv cblas_strsm cblas_strsv cblas_sgeadd cblas_sgemmt cblas_isamax cblas_isamin cblas_ismin cblas_ismax cblas_ssum cblas_simatcopy cblas_somatcopy + cblas_samax cblas_samin " cblasobjsz=" @@ -91,6 +94,7 @@ cblasobjsz=" cblas_ztrsv cblas_cdotc_sub cblas_cdotu_sub cblas_zdotc_sub cblas_zdotu_sub cblas_zaxpby cblas_zgeadd cblas_zgemmt cblas_izamax cblas_izamin cblas_izmin cblas_izmax cblas_dzsum cblas_zimatcopy cblas_zomatcopy + cblas_zaxpyc cblas_zdrot cblas_zrotg cblas_dzamax cblas_dzamin " cblasobjs="cblas_xerbla" @@ -861,6 +865,53 @@ lapackobjs2z="$lapackobjs2z zgedmd zgedmdq " + +#functions added post 3.11 + +lapackobjs2c="$lapackobjs2c + claqp2rk + claqp3rk + ctrsyl3 + " +# claqz0 +# claqz1 +# claqz2 +# claqz3 +# clatrs3 + +lapackobjs2d="$lapackobjs2d + dgelqs + dgelst + dgeqp3rk + dgeqrs + dlaqp2rk + dlaqp3rk + dlarmm + dlatrs3 + dtrsyl3 + " +# 
dlaqz0 +# dlaqz1 +# dlaqz2 +# dlaqz3 +# dlaqz4 + +lapackobjs2z="$lapackobjs2z + zgelqs + zgelst + zgeqp3rk + zgeqrs + zlaqp2rk + zlaqp3rk + zlatrs3 + zrscl + ztrsyl3 + " +# zlaqz0 +# zlaqz1 +# zlaqz2 +# zlaqz3 + lapack_extendedprecision_objs=" zposvxx clagge clatms chesvxx cposvxx cgesvxx ssyrfssx csyrfsx dlagsy dsysvxx sporfsx slatms zlatms zherfsx csysvxx @@ -1622,6 +1673,14 @@ lapackeobjsc=" LAPACKE_cgetsqrhrt_work LAPACKE_cungtsqr_row LAPACKE_cungtsqr_row_work + LAPACKE_clangb + LAPACKE_clangb_work + LAPACKE_ctrsyl3 + LAPACKE_ctrsyl3_work + LAPACKE_ctz_nancheck + LAPACKE_ctz_trans + LAPACKE_cunhr_col + LAPACKE_cunhr_col_work " lapackeobjsd=" @@ -2239,6 +2298,14 @@ lapackeobjsd=" LAPACKE_dgetsqrhrt_work LAPACKE_dorgtsqr_row LAPACKE_dorgtsqr_row_work + LAPACKE_dlangb + LAPACKE_dlangb_work + LAPACKE_dorhr_col + LAPACKE_dorhr_col_work + LAPACKE_dtrsyl3 + LAPACKE_dtrsyl3_work + LAPACKE_dtz_nancheck + LAPACKE_dtz_trans " lapackeobjss=" @@ -2848,6 +2915,14 @@ lapackeobjss=" LAPACKE_sgetsqrhrt_work LAPACKE_sorgtsqr_row LAPACKE_sorgtsqr_row_work + LAPACKE_slangb + LAPACKE_slangb_work + LAPACKE_sorhr_col + LAPACKE_sorhr_col_work + LAPACKE_strsyl3 + LAPACKE_strsyl3_work + LAPACKE_stz_nancheck + LAPACKE_stz_trans " lapackeobjsz=" @@ -3515,6 +3590,14 @@ lapackeobjsz=" LAPACKE_zgetsqrhrt_work LAPACKE_zungtsqr_row LAPACKE_zungtsqr_row_work + LAPACKE_zlangb + LAPACKE_zlangb_work + LAPACKE_ztrsyl3 + LAPACKE_ztrsyl3_work + LAPACKE_ztz_nancheck + LAPACKE_ztz_trans + LAPACKE_zunhr_col + LAPACKE_zunhr_col_work " ## @(SRCX_OBJ) from `lapack-3.4.1/lapacke/src/Makefile` ## Not exported: requires LAPACKE_EXTENDED to be set and depends on the @@ -3616,6 +3699,7 @@ lapack_embeded_underscore_objs_s=" ssysv_aa_2stage ssytrf_aa_2stage ssytrs_aa_2stage slaorhr_col_getrfnp slaorhr_col_getrfnp2 sorhr_col + slarfb_gett " lapack_embeded_underscore_objs_c=" chetf2_rook chetrf_rook chetri_rook @@ -3641,6 +3725,7 @@ lapack_embeded_underscore_objs_c=" csysv_aa_2stage csytrf_aa_2stage csytrs_aa_2stage claunhr_col_getrfnp claunhr_col_getrfnp2 cunhr_col + clarfb_gett " lapack_embeded_underscore_objs_d=" dlasyf_rook @@ -3658,6 +3743,7 @@ lapack_embeded_underscore_objs_d=" dsysv_aa_2stage dsytrf_aa_2stage dsytrs_aa_2stage dlaorhr_col_getrfnp dlaorhr_col_getrfnp2 dorhr_col + dlarfb_gett " lapack_embeded_underscore_objs_z=" zhetf2_rook zhetrf_rook zhetri_rook @@ -3682,6 +3768,7 @@ lapack_embeded_underscore_objs_z=" zhetrs_aa_2stage zsysv_aa_2stage zsytrf_aa_2stage zsytrs_aa_2stage zlaunhr_col_getrfnp zlaunhr_col_getrfnp2 zunhr_col + zlarfb_gett " dirname=`pwd -P`/../lapack-netlib diff --git a/f_check b/f_check index dac34edee..81f598ffa 100755 --- a/f_check +++ b/f_check @@ -45,7 +45,7 @@ if [ -z "$compiler" ]; then pathf90 pathf95 pgf95 pgf90 pgf77 pgfortran nvfortran flang egfortran - ifort nagfor ifx ftn crayftn" + ifort nagfor ifx ftn crayftn armflang" for list in $lists; do for p in $path; do @@ -85,7 +85,11 @@ else *Hewlett*) vendor=CRAY openmp='-fopenmp' - ;; + ;; + *Arm\ F90*) + vendor=FLANG + openmp='-fopenmp' + ;; *GNU*|*GCC*) v="${data#*GCC: *\) }" @@ -108,7 +112,7 @@ else if [ "$major" -ge 17 ]; then vendor=FLANGNEW fi - ;; + ;; *ifort*|*ifx*) vendor=INTEL openmp='-fopenmp' diff --git a/getarch.c b/getarch.c index 87384c084..f879e6bbb 100644 --- a/getarch.c +++ b/getarch.c @@ -150,6 +150,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/* #define FORCE_EV4 */ /* #define FORCE_EV5 */ /* #define FORCE_EV6 */ +/* #define FORCE_CSKY */ +/* #define FORCE_CK860FV */ /* #define FORCE_GENERIC */ #ifdef FORCE_P2 @@ -1677,9 +1679,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define LIBNAME "c910v" #define CORENAME "C910V" #endif +#endif +#ifdef FORCE_x280 +#define FORCE +#define ARCHITECTURE "RISCV64" +#define SUBARCHITECTURE "x280" +#define SUBDIRNAME "riscv64" +#define ARCHCONFIG "-Dx280 " \ + "-DL1_DATA_SIZE=64536 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " +#define LIBNAME "x280" +#define CORENAME "x280" #else #endif +#ifdef FORCE_RISCV64_ZVL256B +#define FORCE +#define ARCHITECTURE "RISCV64" +#define SUBARCHITECTURE "RISCV64_ZVL256B" +#define SUBDIRNAME "riscv64" +#define ARCHCONFIG "-DRISCV64_ZVL256B " \ + "-DL1_DATA_SIZE=64536 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " +#define LIBNAME "riscv64_zvl256b" +#define CORENAME "RISCV64_ZVL256B" +#endif + +#ifdef FORCE_RISCV64_ZVL128B +#define FORCE +#define ARCHITECTURE "RISCV64" +#define SUBARCHITECTURE "RISCV64_ZVL128B" +#define SUBDIRNAME "riscv64" +#define ARCHCONFIG "-DRISCV64_ZVL128B " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " +#define LIBNAME "riscv64_zvl128b" +#define CORENAME "RISCV64_ZVL128B" +#endif #if defined(FORCE_E2K) || defined(__e2k__) #define FORCE @@ -1692,6 +1731,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "generic" #endif +#ifdef FORCE_CSKY +#define FORCE +#define ARCHITECTURE "CSKY" +#define SUBARCHITECTURE "CSKY" +#define SUBDIRNAME "csky" +#define ARCHCONFIG "-DCSKY" \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=524288 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " +#define LIBNAME "csky" +#define CORENAME "CSKY" +#endif + +#ifdef FORCE_CK860FV +#define FORCE +#define ARCHITECTURE "CSKY" +#define SUBARCHITECTURE "CK860V" +#define SUBDIRNAME "csky" +#define ARCHCONFIG "-DCK860FV " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=524288 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " +#define LIBNAME "ck860fv" +#define CORENAME "CK860FV" +#endif + + #ifndef FORCE #ifdef USER_TARGET @@ -1766,7 +1832,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define OPENBLAS_SUPPORTED #endif - #ifndef OPENBLAS_SUPPORTED #error "This arch/CPU is not supported by OpenBLAS." 
#endif @@ -1831,7 +1896,7 @@ int main(int argc, char *argv[]){ #ifdef FORCE printf("CORE=%s\n", CORENAME); #else -#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) || defined(__alpha__) +#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) || defined(__alpha__) || defined(__csky__) printf("CORE=%s\n", get_corename()); #endif #endif @@ -1979,7 +2044,7 @@ printf("ELF_VERSION=2\n"); #ifdef FORCE printf("#define CHAR_CORENAME \"%s\"\n", CORENAME); #else -#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) +#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) || defined(__csky__) printf("#define CHAR_CORENAME \"%s\"\n", get_corename()); #endif #endif diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index 4e082928b..55374674a 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -119,6 +119,7 @@ endif () if (BUILD_BFLOAT16) GenerateNamedObjects("bf16dot.c" "" "sbdot" ${CBLAS_FLAG} "" "" true "BFLOAT16") GenerateNamedObjects("gemm.c" "" "sbgemm" ${CBLAS_FLAG} "" "" true "BFLOAT16") + GenerateNamedObjects("gemmt.c" "" "sbgemmt" ${CBLAS_FLAG} "" "" true "BFLOAT16") GenerateNamedObjects("sbgemv.c" "" "sbgemv" ${CBLAS_FLAG} "" "" true "BFLOAT16") GenerateNamedObjects("tobf16.c" "SINGLE_PREC" "sbstobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16") GenerateNamedObjects("tobf16.c" "DOUBLE_PREC" "sbdtobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16") @@ -130,6 +131,8 @@ endif () foreach (float_type ${FLOAT_TYPES}) if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") + GenerateNamedObjects("zaxpy.c" "" "axpyc" ${CBLAS_FLAG} "" "" false ${float_type}) + GenerateNamedObjects("zger.c" "" "geru" ${CBLAS_FLAG} "" "" false ${float_type}) GenerateNamedObjects("zger.c" "CONJ" "gerc" ${CBLAS_FLAG} "" "" false ${float_type}) GenerateNamedObjects("zdot.c" "CONJ" "dotc" ${CBLAS_FLAG} "" "" false ${float_type}) diff --git a/interface/Makefile b/interface/Makefile index 78335357b..048d679d6 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -270,7 +270,8 @@ CSBLAS1OBJS = \ cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \ cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \ cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) \ - cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX) + cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX) cblas_samax.$(SUFFIX) \ + cblas_samin.$(SUFFIX) CSBLAS2OBJS = \ cblas_sgemv.$(SUFFIX) cblas_sger.$(SUFFIX) cblas_ssymv.$(SUFFIX) cblas_strmv.$(SUFFIX) \ @@ -295,7 +296,8 @@ CDBLAS1OBJS = \ cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \ cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \ cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) \ - cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX) + cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX) cblas_damax.$(SUFFIX) \ + 
cblas_damin.$(SUFFIX) CDBLAS2OBJS = \ cblas_dgemv.$(SUFFIX) cblas_dger.$(SUFFIX) cblas_dsymv.$(SUFFIX) cblas_dtrmv.$(SUFFIX) \ @@ -315,7 +317,7 @@ CCBLAS1OBJS = \ cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \ cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \ cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \ - cblas_caxpby.$(SUFFIX) \ + cblas_caxpby.$(SUFFIX) cblas_scamax.$(SUFFIX) cblas_caxpyc.$(SUFFIX) cblas_scamin.$(SUFFIX) \ cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX) cblas_csrot.$(SUFFIX) cblas_crotg.$(SUFFIX) CCBLAS2OBJS = \ @@ -340,12 +342,12 @@ CXERBLAOBJ = \ CZBLAS1OBJS = \ cblas_izamax.$(SUFFIX) cblas_izamin.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \ - cblas_zcopy.$(SUFFIX) \ + cblas_zcopy.$(SUFFIX) cblas_dzamax.$(SUFFIX) cblas_dzamin.$(SUFFIX) \ cblas_zdotc.$(SUFFIX) cblas_zdotu.$(SUFFIX) \ cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \ cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \ cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \ - cblas_zaxpby.$(SUFFIX) \ + cblas_zaxpby.$(SUFFIX) cblas_zaxpyc.$(SUFFIX) \ cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX) cblas_zdrot.$(SUFFIX) cblas_zrotg.$(SUFFIX) @@ -1301,7 +1303,7 @@ xhpr2.$(SUFFIX) xhpr2.$(PSUFFIX) : zhpr2.c ifeq ($(BUILD_BFLOAT16),1) sbgemm.$(SUFFIX) sbgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -c $(CFLAGS) $< -o $(@F) -sbgemmt.$(SUFFIX) sbgemmt.$(PSUFFIX) : gemmt.c ../param.h +sbgemmt.$(SUFFIX) sbgemmt.$(PSUFFIX) : sbgemmt.c ../param.h $(CC) -c $(CFLAGS) $< -o $(@F) endif @@ -1533,6 +1535,30 @@ cblas_icmin.$(SUFFIX) cblas_icmin.$(PSUFFIX) : imax.c cblas_izmin.$(SUFFIX) cblas_izmin.$(PSUFFIX) : imax.c $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F) +cblas_samax.$(SUFFIX) cblas_samax.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +cblas_damax.$(SUFFIX) cblas_damax.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +cblas_scamax.$(SUFFIX) cblas_scamax.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +cblas_dzamax.$(SUFFIX) cblas_dzamax.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +cblas_samin.$(SUFFIX) cblas_samin.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +cblas_damin.$(SUFFIX) cblas_damin.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +cblas_scamin.$(SUFFIX) cblas_scamin.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +cblas_dzamin.$(SUFFIX) cblas_dzamin.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + cblas_sasum.$(SUFFIX) cblas_sasum.$(PSUFFIX) : asum.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) @@ -1627,6 +1653,15 @@ cblas_daxpy.$(SUFFIX) cblas_daxpy.$(PSUFFIX) : axpy.c cblas_caxpy.$(SUFFIX) cblas_caxpy.$(PSUFFIX) : zaxpy.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) +cblas_caxpyc.$(SUFFIX) cblas_caxpyc.$(PSUFFIX) : zaxpy.c + $(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F) + +cblas_zaxpyc.$(SUFFIX) cblas_zaxpyc.$(PSUFFIX) : zaxpy.c + $(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F) + +cblas_xaxpyc.$(SUFFIX) cblas_xaxpyc.$(PSUFFIX) : zaxpy.c + $(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F) + cblas_zaxpy.$(SUFFIX) cblas_zaxpy.$(PSUFFIX) : zaxpy.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) @@ -1932,7 +1967,7 @@ cblas_sgemmt.$(SUFFIX) cblas_sgemmt.$(PSUFFIX) : gemmt.c ../param.h $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) ifeq ($(BUILD_BFLOAT16),1) -cblas_sbgemmt.$(SUFFIX) cblas_sbgemmt.$(PSUFFIX) : gemmt.c 
../param.h +cblas_sbgemmt.$(SUFFIX) cblas_sbgemmt.$(PSUFFIX) : sbgemmt.c ../param.h $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) endif diff --git a/interface/gemmt.c b/interface/gemmt.c index 046432670..018deb7fb 100644 --- a/interface/gemmt.c +++ b/interface/gemmt.c @@ -78,6 +78,9 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB, char transA, transB, Uplo; blasint nrowa, nrowb; +#if defined(COMPLEX) + blasint ncolb; +#endif IFLOAT *buffer; IFLOAT *aa, *bb; FLOAT *cc; @@ -155,19 +158,27 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB, uplo = 0; if (Uplo == 'L') uplo = 1; - + nrowa = m; - if (transa) nrowa = k; + if (transa & 1) nrowa = k; nrowb = k; - if (transb) nrowb = m; +#if defined(COMPLEX) + ncolb = m; +#endif + if (transb & 1) { + nrowb = m; +#if defined(COMPLEX) + ncolb = k; +#endif + } info = 0; if (ldc < MAX(1, m)) info = 13; - if (ldb < MAX(1, nrowa)) + if (ldb < MAX(1, nrowb)) info = 10; - if (lda < MAX(1, nrowb)) + if (lda < MAX(1, nrowa)) info = 8; if (k < 0) info = 5; @@ -211,6 +222,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint info; blasint lda, ldb; FLOAT *a, *b; +#if defined(COMPLEX) + blasint nrowb, ncolb; +#endif XFLOAT *buffer; PRINT_DEBUG_CNAME; @@ -262,11 +276,22 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, info = -1; - blasint nrowa, nrowb; + blasint nrowa; +#if !defined(COMPLEX) + blasint nrowb; +#endif nrowa = m; - if (transa) nrowa = k; + if (transa & 1) nrowa = k; nrowb = k; - if (transb) nrowb = m; +#if defined(COMPLEX) + ncolb = m; +#endif + if (transb & 1) { + nrowb = m; +#if defined(COMPLEX) + ncolb = k; +#endif + } if (ldc < MAX(1, m)) info = 13; @@ -330,26 +355,38 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, info = -1; - blasint ncola, ncolb; - ncola = k; - if (transa) ncola = m; - ncolb = m; - if (transb) ncolb = k; + blasint ncola; +#if !defined(COMPLEX) + blasint ncolb; +#endif + ncola = m; + if (transa & 1) ncola = k; + ncolb = k; +#if defined(COMPLEX) + nrowb = m; +#endif + + if (transb & 1) { +#if defined(COMPLEX) + nrowb = k; +#endif + ncolb = m; + } if (ldc < MAX(1,m)) info = 13; if (ldb < MAX(1, ncolb)) - info = 10; - if (lda < MAX(1, ncola)) info = 8; + if (lda < MAX(1, ncola)) + info = 10; if (k < 0) info = 5; if (m < 0) info = 4; if (transb < 0) - info = 3; - if (transa < 0) info = 2; + if (transa < 0) + info = 3; if (uplo < 0) info = 1; } @@ -428,7 +465,20 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, IDEBUG_START; - const blasint incb = (transb == 0) ? 1 : ldb; +#if defined(COMPLEX) + if (transb > 1){ +#ifndef CBLAS + IMATCOPY_K_CNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb); +#else + if (order == CblasColMajor) + IMATCOPY_K_CNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb); + if (order == CblasRowMajor) + IMATCOPY_K_RNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb); +#endif + } +#endif + + const blasint incb = ((transb & 1) == 0) ? 
1 : ldb; if (uplo == 1) { for (i = 0; i < m; i++) { @@ -438,19 +488,19 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, #if defined(COMPLEX) aa = a + i * 2; bb = b + i * ldb * 2; - if (transa) { + if (transa & 1) { aa = a + lda * i * 2; } - if (transb) + if (transb & 1) bb = b + i * 2; cc = c + i * 2 * ldc + i * 2; #else aa = a + i; bb = b + i * ldb; - if (transa) { + if (transa & 1) { aa = a + lda * i; } - if (transb) + if (transb & 1) bb = b + i; cc = c + i * ldc + i; #endif @@ -461,7 +511,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, NULL, 0); if (alpha_r == ZERO && alpha_i == ZERO) - return; + continue; #else if (beta != ONE) SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0); @@ -478,7 +528,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, #endif // for alignment buffer_size = (buffer_size + 3) & ~3; - STACK_ALLOC(buffer_size, FLOAT, buffer); + STACK_ALLOC(buffer_size, IFLOAT, buffer); #ifdef SMP @@ -491,7 +541,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, #endif #if defined(COMPLEX) - if (!transa) + if (!(transa & 1)) (gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i, aa, lda, bb, incb, cc, 1, buffer); @@ -500,7 +550,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, aa, lda, bb, incb, cc, 1, buffer); #else - if (!transa) + if (!(transa & 1)) (gemv[(int)transa]) (j, k, 0, alpha, aa, lda, bb, incb, cc, 1, buffer); else @@ -509,7 +559,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, #endif #ifdef SMP } else { - if (!transa) + if (!(transa & 1)) (gemv_thread[(int)transa]) (j, k, alpha, aa, lda, bb, incb, cc, 1, buffer, @@ -533,13 +583,13 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, l = j; #if defined COMPLEX bb = b + i * ldb * 2; - if (transb) { + if (transb & 1) { bb = b + i * 2; } cc = c + i * 2 * ldc; #else bb = b + i * ldb; - if (transb) { + if (transb & 1) { bb = b + i; } cc = c + i * ldc; @@ -551,7 +601,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, NULL, 0); if (alpha_r == ZERO && alpha_i == ZERO) - return; + continue; #else if (beta != ONE) SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0); @@ -567,7 +617,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, #endif // for alignment buffer_size = (buffer_size + 3) & ~3; - STACK_ALLOC(buffer_size, FLOAT, buffer); + STACK_ALLOC(buffer_size, IFLOAT, buffer); #ifdef SMP @@ -580,7 +630,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, #endif #if defined(COMPLEX) - if (!transa) + if (!(transa & 1)) (gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i, a, lda, bb, incb, cc, 1, buffer); @@ -589,7 +639,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, a, lda, bb, incb, cc, 1, buffer); #else - if (!transa) + if (!(transa & 1)) (gemv[(int)transa]) (j, k, 0, alpha, a, lda, bb, incb, cc, 1, buffer); else @@ -599,7 +649,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, #ifdef SMP } else { - if (!transa) + if (!(transa & 1)) (gemv_thread[(int)transa]) (j, k, alpha, a, lda, bb, incb, cc, 1, buffer, nthreads); diff --git a/interface/gemv.c b/interface/gemv.c index 1f0763579..2c121f130 100644 --- a/interface/gemv.c +++ b/interface/gemv.c @@ -226,7 +226,7 @@ void CNAME(enum CBLAS_ORDER order, #ifdef SMP - if ( 1L * m * n < 2304L * GEMM_MULTITHREAD_THRESHOLD ) + if ( 1L * m * n < 115200L * GEMM_MULTITHREAD_THRESHOLD ) nthreads = 1; else nthreads = num_cpu_avail(2); diff --git a/interface/imatcopy.c b/interface/imatcopy.c index 6a1ad282c..69876e31e 100644 --- a/interface/imatcopy.c +++ b/interface/imatcopy.c @@ 
-154,7 +154,10 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, } #endif - msize = (size_t)(*rows) * (*cols) * sizeof(FLOAT); + if ( *rows > *cols ) + msize = (size_t)(*rows) * (*ldb) * sizeof(FLOAT); + else + msize = (size_t)(*cols) * (*ldb) * sizeof(FLOAT); b = malloc(msize); if ( b == NULL ) diff --git a/interface/lapack/gesv.c b/interface/lapack/gesv.c index 175350329..546c2bed2 100644 --- a/interface/lapack/gesv.c +++ b/interface/lapack/gesv.c @@ -114,7 +114,14 @@ int NAME(blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA, blasint *ipiv, #ifdef SMP args.common = NULL; - args.nthreads = num_cpu_avail(4); +#ifndef DOUBLE + if (args.m*args.n < 40000) +#else + if (args.m*args.n < 10000) +#endif + args.nthreads=1; + else + args.nthreads = num_cpu_avail(4); if (args.nthreads == 1) { #endif diff --git a/interface/max.c b/interface/max.c index f05977448..6c7d32bd9 100644 --- a/interface/max.c +++ b/interface/max.c @@ -145,8 +145,13 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){ #else +#ifdef COMPLEX +FLOAT CNAME(blasint n, void *vx, blasint incx){ + FLOAT *x = (FLOAT*) vx; +#else FLOAT CNAME(blasint n, FLOAT *x, blasint incx){ - +#endif + FLOAT ret; PRINT_DEBUG_CNAME; diff --git a/interface/rotmg.c b/interface/rotmg.c index 3a5ca8f95..b8f627221 100644 --- a/interface/rotmg.c +++ b/interface/rotmg.c @@ -96,12 +96,6 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){ else { dp2 = *dd2 * dy1; - if(dp2 == ZERO) - { - dflag = -TWO; - dparam[0] = dflag; - return; - } dp1 = *dd1 * *dx1; dq2 = dp2 * dy1; dq1 = dp1 * *dx1; @@ -113,24 +107,10 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){ dh12 = dp2 / dp1; du = ONE - dh12 * dh21; - if(du > ZERO) - { - dflag = ZERO; - *dd1 = *dd1 / du; - *dd2 = *dd2 / du; - *dx1 = *dx1 * du; - } else { - dflag = -ONE; - - dh11 = ZERO; - dh12 = ZERO; - dh21 = ZERO; - dh22 = ZERO; - - *dd1 = ZERO; - *dd2 = ZERO; - *dx1 = ZERO; - } + dflag = ZERO; + *dd1 = *dd1 / du; + *dd2 = *dd2 / du; + *dx1 = *dx1 * du; } else diff --git a/interface/sbgemmt.c b/interface/sbgemmt.c new file mode 100644 index 000000000..759af4bfb --- /dev/null +++ b/interface/sbgemmt.c @@ -0,0 +1,447 @@ +/*********************************************************************/ +/* Copyright 2024, The OpenBLAS Project. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +#define SMP_THRESHOLD_MIN 65536.0 +#define ERROR_NAME "SBGEMMT " + +#ifndef GEMM_MULTITHREAD_THRESHOLD +#define GEMM_MULTITHREAD_THRESHOLD 4 +#endif + +#ifndef CBLAS + +void NAME(char *UPLO, char *TRANSA, char *TRANSB, + blasint * M, blasint * K, + FLOAT * Alpha, + IFLOAT * a, blasint * ldA, + IFLOAT * b, blasint * ldB, FLOAT * Beta, FLOAT * c, blasint * ldC) +{ + + blasint m, k; + blasint lda, ldb, ldc; + int transa, transb, uplo; + blasint info; + + char transA, transB, Uplo; + blasint nrowa, nrowb; + IFLOAT *buffer; + IFLOAT *aa, *bb; + FLOAT *cc; + FLOAT alpha, beta; + + PRINT_DEBUG_NAME; + + m = *M; + k = *K; + + alpha = *Alpha; + beta = *Beta; + + lda = *ldA; + ldb = *ldB; + ldc = *ldC; + + transA = *TRANSA; + transB = *TRANSB; + Uplo = *UPLO; + TOUPPER(transA); + TOUPPER(transB); + TOUPPER(Uplo); + + transa = -1; + transb = -1; + uplo = -1; + + if (transA == 'N') + transa = 0; + if (transA == 'T') + transa = 1; + + if (transA == 'R') + transa = 0; + if (transA == 'C') + transa = 1; + + if (transB == 'N') + transb = 0; + if (transB == 'T') + transb = 1; + + if (transB == 'R') + transb = 0; + if (transB == 'C') + transb = 1; + + if (Uplo == 'U') + uplo = 0; + if (Uplo == 'L') + uplo = 1; + nrowa = m; + if (transa & 1) nrowa = k; + nrowb = k; + if (transb & 1) nrowb = m; + + info = 0; + + if (ldc < MAX(1, m)) + info = 13; + if (ldb < MAX(1, nrowb)) + info = 10; + if (lda < MAX(1, nrowa)) + info = 8; + if (k < 0) + info = 5; + if (m < 0) + info = 4; + if (transb < 0) + info = 3; + if (transa < 0) + info = 2; + if (uplo < 0) + info = 1; + + if (info != 0) { + BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, + enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint m, + blasint k, + FLOAT alpha, + IFLOAT * A, blasint LDA, + IFLOAT * B, blasint LDB, FLOAT beta, FLOAT * c, blasint ldc) +{ + IFLOAT *aa, *bb; + FLOAT *cc; + + int transa, transb, uplo; + blasint info; + blasint lda, ldb; + IFLOAT *a, *b; + XFLOAT *buffer; + + PRINT_DEBUG_CNAME; + + uplo = -1; + transa = -1; + transb = -1; + info = 0; + + if (order == CblasColMajor) { + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + if (TransA == CblasNoTrans) + transa = 0; + if (TransA == CblasTrans) + transa = 1; + + if (TransA == CblasConjNoTrans) + transa = 0; + if (TransA == CblasConjTrans) + transa = 1; + + if (TransB == CblasNoTrans) + transb = 0; + if (TransB == CblasTrans) + transb = 1; + + if (TransB == CblasConjNoTrans) + transb = 0; + if (TransB == CblasConjTrans) + transb = 1; + + a = (void *)A; + b = (void *)B; + lda = LDA; + ldb = LDB; + + info = -1; + + blasint nrowa; + blasint nrowb; + nrowa = m; + if (transa & 1) nrowa = k; + nrowb = k; + if (transb & 1) nrowb = m; + + if (ldc < MAX(1, m)) + info 
= 13; + if (ldb < MAX(1, nrowb)) + info = 10; + if (lda < MAX(1, nrowa)) + info = 8; + if (k < 0) + info = 5; + if (m < 0) + info = 4; + if (transb < 0) + info = 3; + if (transa < 0) + info = 2; + if (uplo < 0) + info = 1; + } + + if (order == CblasRowMajor) { + + a = (void *)B; + b = (void *)A; + + lda = LDB; + ldb = LDA; + + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + if (TransB == CblasNoTrans) + transa = 0; + if (TransB == CblasTrans) + transa = 1; + + if (TransB == CblasConjNoTrans) + transa = 0; + if (TransB == CblasConjTrans) + transa = 1; + + if (TransA == CblasNoTrans) + transb = 0; + if (TransA == CblasTrans) + transb = 1; + + if (TransA == CblasConjNoTrans) + transb = 0; + if (TransA == CblasConjTrans) + transb = 1; + + info = -1; + + blasint ncola; + blasint ncolb; + + ncola = m; + if (transa & 1) ncola = k; + ncolb = k; + + if (transb & 1) { + ncolb = m; + } + + if (ldc < MAX(1,m)) + info = 13; + if (ldb < MAX(1, ncolb)) + info = 8; + if (lda < MAX(1, ncola)) + info = 10; + if (k < 0) + info = 5; + if (m < 0) + info = 4; + if (transb < 0) + info = 2; + if (transa < 0) + info = 3; + if (uplo < 0) + info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + int buffer_size; + blasint i, j; + +#ifdef SMP + int nthreads; +#endif + + +#ifdef SMP + static int (*gemv_thread[]) (BLASLONG, BLASLONG, FLOAT, IFLOAT *, + BLASLONG, IFLOAT *, BLASLONG, FLOAT, + FLOAT *, BLASLONG, int) = { + sbgemv_thread_n, sbgemv_thread_t, + }; +#endif + int (*gemv[]) (BLASLONG, BLASLONG, FLOAT, IFLOAT *, BLASLONG, + IFLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG) = { + SBGEMV_N, SBGEMV_T,}; + + + if (m == 0) + return; + + IDEBUG_START; + + const blasint incb = ((transb & 1) == 0) ? 
1 : ldb; + + if (uplo == 1) { + for (i = 0; i < m; i++) { + j = m - i; + + aa = a + i; + bb = b + i * ldb; + if (transa & 1) { + aa = a + lda * i; + } + if (transb & 1) + bb = b + i; + cc = c + i * ldc + i; + +#if 0 + if (beta != ONE) + SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0); + + if (alpha == ZERO) + continue; +#endif + + IDEBUG_START; + + buffer_size = j + k + 128 / sizeof(FLOAT); +#ifdef WINDOWS_ABI + buffer_size += 160 / sizeof(FLOAT); +#endif + // for alignment + buffer_size = (buffer_size + 3) & ~3; + STACK_ALLOC(buffer_size, IFLOAT, buffer); + +#ifdef SMP + + if (1L * j * k < 2304L * GEMM_MULTITHREAD_THRESHOLD) + nthreads = 1; + else + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + if (!(transa & 1)) + (gemv[(int)transa]) (j, k, alpha, aa, lda, + bb, incb, beta, cc, 1); + else + (gemv[(int)transa]) (k, j, alpha, aa, lda, + bb, incb, beta, cc, 1); + +#ifdef SMP + } else { + if (!(transa & 1)) + (gemv_thread[(int)transa]) (j, k, alpha, aa, + lda, bb, incb, beta, cc, + 1, nthreads); + else + (gemv_thread[(int)transa]) (k, j, alpha, aa, + lda, bb, incb, beta, cc, + 1, nthreads); + + } +#endif + + STACK_FREE(buffer); + } + } else { + + for (i = 0; i < m; i++) { + j = i + 1; + + bb = b + i * ldb; + if (transb & 1) { + bb = b + i; + } + cc = c + i * ldc; + +#if 0 + if (beta != ONE) + SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0); + + if (alpha == ZERO) + continue; +#endif + IDEBUG_START; + + buffer_size = j + k + 128 / sizeof(FLOAT); +#ifdef WINDOWS_ABI + buffer_size += 160 / sizeof(FLOAT); +#endif + // for alignment + buffer_size = (buffer_size + 3) & ~3; + STACK_ALLOC(buffer_size, IFLOAT, buffer); + +#ifdef SMP + + if (1L * j * k < 2304L * GEMM_MULTITHREAD_THRESHOLD) + nthreads = 1; + else + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + if (!(transa & 1)) + (gemv[(int)transa]) (j, k, alpha, a, lda, bb, + incb, beta, cc, 1); + else + (gemv[(int)transa]) (k, j, alpha, a, lda, bb, + incb, beta, cc, 1); + +#ifdef SMP + } else { + if (!(transa & 1)) + (gemv_thread[(int)transa]) (j, k, alpha, a, lda, + bb, incb, beta, cc, 1, + nthreads); + else + (gemv_thread[(int)transa]) (k, j, alpha, a, lda, + bb, incb, beta, cc, 1, + nthreads); + } +#endif + + STACK_FREE(buffer); + } + } + + IDEBUG_END; + + return; +} diff --git a/interface/zaxpby.c b/interface/zaxpby.c index 3a4db7403..e5065270d 100644 --- a/interface/zaxpby.c +++ b/interface/zaxpby.c @@ -39,12 +39,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
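The gemmt.c changes above tighten the lda/ldb/info checks and treat the conjugate-transpose flags via (trans & 1), and the new interface/sbgemmt.c applies the same row-by-row GEMV decomposition to bfloat16 input. A usage sketch for the double-precision path (hypothetical caller code; it assumes the cblas.h prototype of cblas_dgemmt follows the argument order of the CNAME shown here: order, uplo, transA, transB, m, k, alpha, A, lda, B, ldb, beta, C, ldc):

    /* Only the selected triangle of the m x m result
       C := alpha*op(A)*op(B) + beta*C is computed and written. */
    #include <cblas.h>

    int main(void)
    {
        double A[6] = {1, 2, 3, 4, 5, 6};   /* 2 x 3, column-major, lda = 2 */
        double B[6] = {1, 0, 2, 1, 0, 3};   /* 3 x 2, column-major, ldb = 3 */
        double C[4] = {0, 0, 0, 0};         /* 2 x 2, ldc = 2               */

        cblas_dgemmt(CblasColMajor, CblasLower, CblasNoTrans, CblasNoTrans,
                     2, 3, 1.0, A, 2, B, 3, 0.0, C, 2);
        /* The diagonal and below-diagonal entries of C are now filled;
           the strictly upper part is left untouched.                     */
        return 0;
    }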
#ifndef CBLAS -void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY) +void NAME(blasint *N, void *VALPHA, FLOAT *x, blasint *INCX, void *VBETA, FLOAT *y, blasint *INCY) { blasint n = *N; blasint incx = *INCX; blasint incy = *INCY; + FLOAT* ALPHA = (FLOAT*) VALPHA; + FLOAT* BETA = (FLOAT*) VBETA; #else diff --git a/interface/zimatcopy.c b/interface/zimatcopy.c index b66489eb7..b879c9ac2 100644 --- a/interface/zimatcopy.c +++ b/interface/zimatcopy.c @@ -183,7 +183,10 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, } #endif - msize = (size_t)(*rows) * (*cols) * sizeof(FLOAT) * 2; + if ( *rows > *cols ) + msize = (size_t)(*rows) * (*ldb) * sizeof(FLOAT) * 2; + else + msize = (size_t)(*cols) * (*ldb) * sizeof(FLOAT) * 2; b = malloc(msize); if ( b == NULL ) diff --git a/kernel/arm/zscal.c b/kernel/arm/zscal.c index 0521aaa0b..b2d537d04 100644 --- a/kernel/arm/zscal.c +++ b/kernel/arm/zscal.c @@ -60,6 +60,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F else { temp = - da_i * x[ip+1] ; + if (isnan(x[ip]) || isinf(x[ip])) temp = NAN; x[ip+1] = da_i * x[ip] ; } } diff --git a/kernel/arm64/KERNEL.A64FX b/kernel/arm64/KERNEL.A64FX index ccbce27e1..bc5999097 100644 --- a/kernel/arm64/KERNEL.A64FX +++ b/kernel/arm64/KERNEL.A64FX @@ -1,206 +1 @@ -SAMINKERNEL = ../arm/amin.c -DAMINKERNEL = ../arm/amin.c -CAMINKERNEL = ../arm/zamin.c -ZAMINKERNEL = ../arm/zamin.c - -SMAXKERNEL = ../arm/max.c -DMAXKERNEL = ../arm/max.c - -SMINKERNEL = ../arm/min.c -DMINKERNEL = ../arm/min.c - -ISAMINKERNEL = ../arm/iamin.c -IDAMINKERNEL = ../arm/iamin.c -ICAMINKERNEL = ../arm/izamin.c -IZAMINKERNEL = ../arm/izamin.c - -ISMAXKERNEL = ../arm/imax.c -IDMAXKERNEL = ../arm/imax.c - -ISMINKERNEL = ../arm/imin.c -IDMINKERNEL = ../arm/imin.c - -STRSMKERNEL_LN = trsm_kernel_LN_sve.c -STRSMKERNEL_LT = trsm_kernel_LT_sve.c -STRSMKERNEL_RN = trsm_kernel_RN_sve.c -STRSMKERNEL_RT = trsm_kernel_RT_sve.c - -DTRSMKERNEL_LN = trsm_kernel_LN_sve.c -DTRSMKERNEL_LT = trsm_kernel_LT_sve.c -DTRSMKERNEL_RN = trsm_kernel_RN_sve.c -DTRSMKERNEL_RT = trsm_kernel_RT_sve.c - -TRSMCOPYLN_M = trsm_lncopy_sve.c -TRSMCOPYLT_M = trsm_ltcopy_sve.c -TRSMCOPYUN_M = trsm_uncopy_sve.c -TRSMCOPYUT_M = trsm_utcopy_sve.c - -CTRSMKERNEL_LN = trsm_kernel_LN_sve.c -CTRSMKERNEL_LT = trsm_kernel_LT_sve.c -CTRSMKERNEL_RN = trsm_kernel_RN_sve.c -CTRSMKERNEL_RT = trsm_kernel_RT_sve.c - -ZTRSMKERNEL_LN = trsm_kernel_LN_sve.c -ZTRSMKERNEL_LT = trsm_kernel_LT_sve.c -ZTRSMKERNEL_RN = trsm_kernel_RN_sve.c -ZTRSMKERNEL_RT = trsm_kernel_RT_sve.c - -ZTRSMCOPYLN_M = ztrsm_lncopy_sve.c -ZTRSMCOPYLT_M = ztrsm_ltcopy_sve.c -ZTRSMCOPYUN_M = ztrsm_uncopy_sve.c -ZTRSMCOPYUT_M = ztrsm_utcopy_sve.c - - -SAMAXKERNEL = amax.S -DAMAXKERNEL = amax.S -CAMAXKERNEL = zamax.S -ZAMAXKERNEL = zamax.S - -SAXPYKERNEL = axpy.S -DAXPYKERNEL = daxpy_thunderx2t99.S -CAXPYKERNEL = zaxpy.S -ZAXPYKERNEL = zaxpy.S - -SROTKERNEL = rot.S -DROTKERNEL = rot.S -CROTKERNEL = zrot.S -ZROTKERNEL = zrot.S - -SSCALKERNEL = scal.S -DSCALKERNEL = scal.S -CSCALKERNEL = zscal.S -ZSCALKERNEL = zscal.S - -SGEMVNKERNEL = gemv_n.S -DGEMVNKERNEL = gemv_n.S -CGEMVNKERNEL = zgemv_n.S -ZGEMVNKERNEL = zgemv_n.S - -SGEMVTKERNEL = gemv_t.S -DGEMVTKERNEL = gemv_t.S -CGEMVTKERNEL = zgemv_t.S -ZGEMVTKERNEL = zgemv_t.S - -SASUMKERNEL = sasum_thunderx2t99.c -DASUMKERNEL = dasum_thunderx2t99.c -CASUMKERNEL = casum_thunderx2t99.c -ZASUMKERNEL = zasum_thunderx2t99.c - -SCOPYKERNEL = copy_thunderx2t99.c -DCOPYKERNEL = 
copy_thunderx2t99.c -CCOPYKERNEL = copy_thunderx2t99.c -ZCOPYKERNEL = copy_thunderx2t99.c - -SSWAPKERNEL = swap_thunderx2t99.S -DSWAPKERNEL = swap_thunderx2t99.S -CSWAPKERNEL = swap_thunderx2t99.S -ZSWAPKERNEL = swap_thunderx2t99.S - -ISAMAXKERNEL = iamax_thunderx2t99.c -IDAMAXKERNEL = iamax_thunderx2t99.c -ICAMAXKERNEL = izamax_thunderx2t99.c -IZAMAXKERNEL = izamax_thunderx2t99.c - -SNRM2KERNEL = scnrm2_thunderx2t99.c -DNRM2KERNEL = dznrm2_thunderx2t99.c -CNRM2KERNEL = scnrm2_thunderx2t99.c -ZNRM2KERNEL = dznrm2_thunderx2t99.c - -DDOTKERNEL = dot.c -SDOTKERNEL = dot.c -CDOTKERNEL = zdot_thunderx2t99.c -ZDOTKERNEL = zdot_thunderx2t99.c -DSDOTKERNEL = dot.S - -DGEMM_BETA = dgemm_beta.S -SGEMM_BETA = sgemm_beta.S - -SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S -STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S - -SGEMMINCOPY = gemm_ncopy_sve_v1x$(SGEMM_UNROLL_N).c -SGEMMITCOPY = gemm_tcopy_sve_v1x$(SGEMM_UNROLL_N).c -SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S -SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S - -SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) -SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) -SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) -SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) - -STRMMUNCOPY_M = trmm_uncopy_sve_v1.c -STRMMLNCOPY_M = trmm_lncopy_sve_v1.c -STRMMUTCOPY_M = trmm_utcopy_sve_v1.c -STRMMLTCOPY_M = trmm_ltcopy_sve_v1.c - -SSYMMUCOPY_M = symm_ucopy_sve.c -SSYMMLCOPY_M = symm_lcopy_sve.c - -DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S -DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S - -DGEMMINCOPY = gemm_ncopy_sve_v1x$(DGEMM_UNROLL_N).c -DGEMMITCOPY = gemm_tcopy_sve_v1x$(DGEMM_UNROLL_N).c -DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S -DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S - -DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) -DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) -DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) -DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) - -DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c -DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c -DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c -DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c - -DSYMMUCOPY_M = symm_ucopy_sve.c -DSYMMLCOPY_M = symm_lcopy_sve.c - -CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S -CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S - -CGEMMINCOPY = gemm_ncopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c -CGEMMITCOPY = gemm_tcopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c -CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c -CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c - -CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) -CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) -CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) -CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) - -CTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c -CTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c -CTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c -CTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c - -CHEMMLTCOPY_M = zhemm_ltcopy_sve.c -CHEMMUTCOPY_M = zhemm_utcopy_sve.c - -CSYMMUCOPY_M = zsymm_ucopy_sve.c -CSYMMLCOPY_M = zsymm_lcopy_sve.c - -ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S -ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S - -ZGEMMINCOPY = gemm_ncopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c -ZGEMMITCOPY = gemm_tcopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c -ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c - -ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) -ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) -ZGEMMONCOPYOBJ = 
zgemm_oncopy$(TSUFFIX).$(SUFFIX) -ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) - -ZTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c -ZTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c -ZTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c -ZTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c - -ZHEMMLTCOPY_M = zhemm_ltcopy_sve.c -ZHEMMUTCOPY_M = zhemm_utcopy_sve.c - -ZSYMMUCOPY_M = zsymm_ucopy_sve.c -ZSYMMLCOPY_M = zsymm_lcopy_sve.c +include $(KERNELDIR)/KERNEL.ARMV8SVE diff --git a/kernel/arm64/KERNEL.CORTEXA55 b/kernel/arm64/KERNEL.CORTEXA55 index e2e006770..574e98b8c 100644 --- a/kernel/arm64/KERNEL.CORTEXA55 +++ b/kernel/arm64/KERNEL.CORTEXA55 @@ -1,196 +1 @@ -SAMINKERNEL = ../arm/amin.c -DAMINKERNEL = ../arm/amin.c -CAMINKERNEL = ../arm/zamin.c -ZAMINKERNEL = ../arm/zamin.c - -SMAXKERNEL = ../arm/max.c -DMAXKERNEL = ../arm/max.c - -SMINKERNEL = ../arm/min.c -DMINKERNEL = ../arm/min.c - -ISAMINKERNEL = ../arm/iamin.c -IDAMINKERNEL = ../arm/iamin.c -ICAMINKERNEL = ../arm/izamin.c -IZAMINKERNEL = ../arm/izamin.c - -ISMAXKERNEL = ../arm/imax.c -IDMAXKERNEL = ../arm/imax.c - -ISMINKERNEL = ../arm/imin.c -IDMINKERNEL = ../arm/imin.c - -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -SAMAXKERNEL = amax.S -DAMAXKERNEL = amax.S -CAMAXKERNEL = zamax.S -ZAMAXKERNEL = zamax.S - -SAXPYKERNEL = axpy.S -DAXPYKERNEL = axpy.S -CAXPYKERNEL = zaxpy.S -ZAXPYKERNEL = zaxpy.S - -SROTKERNEL = rot.S -DROTKERNEL = rot.S -CROTKERNEL = zrot.S -ZROTKERNEL = zrot.S - -SSCALKERNEL = scal.S -DSCALKERNEL = scal.S -CSCALKERNEL = zscal.S -ZSCALKERNEL = zscal.S - -SGEMVNKERNEL = gemv_n.S -DGEMVNKERNEL = gemv_n.S -CGEMVNKERNEL = zgemv_n.S -ZGEMVNKERNEL = zgemv_n.S - -SGEMVTKERNEL = gemv_t.S -DGEMVTKERNEL = gemv_t.S -CGEMVTKERNEL = zgemv_t.S -ZGEMVTKERNEL = zgemv_t.S - - -SASUMKERNEL = asum.S -DASUMKERNEL = asum.S -CASUMKERNEL = casum.S -ZASUMKERNEL = zasum.S - -SCOPYKERNEL = copy.S -DCOPYKERNEL = copy.S -CCOPYKERNEL = copy.S -ZCOPYKERNEL = copy.S - -SSWAPKERNEL = swap.S -DSWAPKERNEL = swap.S -CSWAPKERNEL = swap.S -ZSWAPKERNEL = swap.S - -ISAMAXKERNEL = iamax.S -IDAMAXKERNEL = iamax.S -ICAMAXKERNEL = izamax.S -IZAMAXKERNEL = izamax.S - -SNRM2KERNEL = nrm2.S -DNRM2KERNEL = nrm2.S -CNRM2KERNEL = znrm2.S -ZNRM2KERNEL = znrm2.S - -ifneq ($(C_COMPILER), PGI) -SDOTKERNEL = ../generic/dot.c -else -SDOTKERNEL = dot.S -endif -DDOTKERNEL = dot.S -ifneq ($(C_COMPILER), PGI) -CDOTKERNEL = zdot.S -ZDOTKERNEL = zdot.S -else -CDOTKERNEL = ../arm/zdot.c -ZDOTKERNEL = ../arm/zdot.c -endif -DSDOTKERNEL = dot.S - -DGEMM_BETA = dgemm_beta.S -SGEMM_BETA = sgemm_beta.S - -ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 8x8) -SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S -STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S -else -SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S -STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S -endif 
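Besides the conjugate-axpy and rotation wrappers, the exports/gensymbol and interface/Makefile changes earlier in this patch add CBLAS entry points for the absolute-value max/min extensions (cblas_?amax/?amin and their complex-norm variants). A small usage sketch; the prototypes below are written out by hand to mirror the CNAME signature in interface/max.c and assume the default 32-bit blasint, so they are illustrative rather than copied from cblas.h:

    #include <stdio.h>

    /* Hypothetical declarations matching interface/max.c's CBLAS path. */
    extern double cblas_damax(int n, double *x, int incx);
    extern double cblas_damin(int n, double *x, int incx);

    int main(void)
    {
        double x[5] = {1.0, -7.5, 3.0, -0.25, 6.0};
        printf("max |x_i| = %g\n", cblas_damax(5, x, 1));  /* 7.5  */
        printf("min |x_i| = %g\n", cblas_damin(5, x, 1));  /* 0.25 */
        return 0;
    }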
-ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) -ifeq ($(SGEMM_UNROLL_M), 16) -SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S -else -SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c -endif -ifeq ($(SGEMM_UNROLL_M), 4) -SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S -else -SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c -endif -SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) -SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif - -SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S -SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S -SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) -SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) - -DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_cortexa53.c -DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S - -ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) - -ifeq ($(DGEMM_UNROLL_M), 8) -DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S -DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S -else -DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c -DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c -endif - -DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) -DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif - -ifeq ($(DGEMM_UNROLL_N), 4) -DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S -DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S -else -DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c -DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c -endif - -DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) -DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) - -CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_cortexa53.c -CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S -ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) -CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c -CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c -CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) -CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif -CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c -CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c -CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) -CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) - -ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_cortexa53.c -ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S -ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) -ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c -ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c -ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) -ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif -ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c -ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) -ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) +include $(KERNELDIR)/KERNEL.CORTEXA53 diff --git a/kernel/arm64/KERNEL.THUNDERX3T110 b/kernel/arm64/KERNEL.THUNDERX3T110 index 41cedc851..5d3bd69f7 100644 --- a/kernel/arm64/KERNEL.THUNDERX3T110 +++ b/kernel/arm64/KERNEL.THUNDERX3T110 @@ -1,184 +1 @@ -SAMINKERNEL = ../arm/amin.c -DAMINKERNEL = ../arm/amin.c -CAMINKERNEL = ../arm/zamin.c -ZAMINKERNEL = ../arm/zamin.c - -SMAXKERNEL = ../arm/max.c -DMAXKERNEL = ../arm/max.c - -SMINKERNEL = ../arm/min.c -DMINKERNEL = ../arm/min.c - -ISAMINKERNEL = ../arm/iamin.c -IDAMINKERNEL = ../arm/iamin.c -ICAMINKERNEL = ../arm/izamin.c -IZAMINKERNEL = ../arm/izamin.c - -ISMAXKERNEL = ../arm/imax.c -IDMAXKERNEL = ../arm/imax.c - -ISMINKERNEL = ../arm/imin.c -IDMINKERNEL = ../arm/imin.c - 
-STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -SAMAXKERNEL = amax.S -DAMAXKERNEL = amax.S -CAMAXKERNEL = zamax.S -ZAMAXKERNEL = zamax.S - -SAXPYKERNEL = axpy.S -DAXPYKERNEL = daxpy_thunderx2t99.S -CAXPYKERNEL = zaxpy.S -ZAXPYKERNEL = zaxpy.S - -SROTKERNEL = rot.S -DROTKERNEL = rot.S -CROTKERNEL = zrot.S -ZROTKERNEL = zrot.S - -SSCALKERNEL = scal.S -DSCALKERNEL = scal.S -CSCALKERNEL = zscal.S -ZSCALKERNEL = zscal.S - -SGEMVNKERNEL = gemv_n.S -DGEMVNKERNEL = gemv_n.S -CGEMVNKERNEL = zgemv_n.S -ZGEMVNKERNEL = zgemv_n.S - -SGEMVTKERNEL = gemv_t.S -DGEMVTKERNEL = gemv_t.S -CGEMVTKERNEL = zgemv_t.S -ZGEMVTKERNEL = zgemv_t.S - -STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S -ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) -SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c -SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c -SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) -SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif -SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c -SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c -SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) -SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) - -DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S - -ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) - -ifeq ($(DGEMM_UNROLL_M), 8) -DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S -DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S -else -DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c -DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c -endif - -DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) -DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif - -ifeq ($(DGEMM_UNROLL_N), 4) -DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S -DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S -else -DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c -DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c -endif - -DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) -DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) - -CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S -ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) -CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c -CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c -CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) -CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif -CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c -CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c -CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) -CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) - -ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S -ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) -ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c -ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c -ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) 
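The kernel/arm/zscal.c change earlier in this patch (adding "if (isnan(x[ip]) || isinf(x[ip])) temp = NAN;") restores IEEE NaN propagation in the purely-imaginary-scale shortcut. A standalone demonstration of the case the check addresses (illustrative only, not library code):

    #include <math.h>
    #include <stdio.h>

    /* Scaling (re, im) by (0 + da_i*i):
       full formula:  re' = 0*re - da_i*im,  im' = 0*im + da_i*re.
       The kernel's shortcut drops the 0*re term, so without the added
       check a NaN/Inf in re would silently vanish from re'.            */
    int main(void)
    {
        double re = INFINITY, im = 2.0, da_i = 3.0;
        double full_re     = 0.0 * re - da_i * im;  /* 0*Inf -> NaN          */
        double shortcut_re = -da_i * im;            /* loses the NaN: -6.0   */
        printf("full: %g  shortcut: %g\n", full_re, shortcut_re);
        return 0;
    }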
-ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif -ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c -ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) -ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) - -SASUMKERNEL = sasum_thunderx2t99.c -DASUMKERNEL = dasum_thunderx2t99.c -CASUMKERNEL = casum_thunderx2t99.c -ZASUMKERNEL = zasum_thunderx2t99.c - -SCOPYKERNEL = copy_thunderx2t99.c -DCOPYKERNEL = copy_thunderx2t99.c -CCOPYKERNEL = copy_thunderx2t99.c -ZCOPYKERNEL = copy_thunderx2t99.c - -SSWAPKERNEL = swap_thunderx2t99.S -DSWAPKERNEL = swap_thunderx2t99.S -CSWAPKERNEL = swap_thunderx2t99.S -ZSWAPKERNEL = swap_thunderx2t99.S - -ISAMAXKERNEL = iamax_thunderx2t99.c -IDAMAXKERNEL = iamax_thunderx2t99.c -ICAMAXKERNEL = izamax_thunderx2t99.c -IZAMAXKERNEL = izamax_thunderx2t99.c - -SNRM2KERNEL = scnrm2_thunderx2t99.c -CNRM2KERNEL = scnrm2_thunderx2t99.c -#DNRM2KERNEL = dznrm2_thunderx2t99_fast.c -#ZNRM2KERNEL = dznrm2_thunderx2t99_fast.c -DNRM2KERNEL = dznrm2_thunderx2t99.c -ZNRM2KERNEL = dznrm2_thunderx2t99.c - - -DDOTKERNEL = dot.c -SDOTKERNEL = dot.c -CDOTKERNEL = zdot_thunderx2t99.c -ZDOTKERNEL = zdot_thunderx2t99.c -DSDOTKERNEL = dot.S - -ifeq ($(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N), 8x4) -DGEMMKERNEL = dgemm_kernel_8x4_thunderx2t99.S -endif - -ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 16x4) -SGEMMKERNEL = sgemm_kernel_16x4_thunderx2t99.S -endif - -ifeq ($(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N), 8x4) -CGEMMKERNEL = cgemm_kernel_8x4_thunderx2t99.S -endif - -ifeq ($(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N), 4x4) -ZGEMMKERNEL = zgemm_kernel_4x4_thunderx2t99.S -endif +include $(KERNELDIR)/KERNEL.THUNDERX2T99 diff --git a/kernel/arm64/dot_kernel_sve.c b/kernel/arm64/dot_kernel_sve.c index 9c057551e..16f4cd537 100644 --- a/kernel/arm64/dot_kernel_sve.c +++ b/kernel/arm64/dot_kernel_sve.c @@ -1,4 +1,5 @@ /*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project Copyright (c) 2022, Arm Ltd All rights reserved. Redistribution and use in source and binary forms, with or without @@ -30,37 +31,84 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #ifdef DOUBLE -#define SVE_TYPE svfloat64_t -#define SVE_ZERO svdup_f64(0.0) -#define SVE_WHILELT svwhilelt_b64 -#define SVE_ALL svptrue_b64() -#define SVE_WIDTH svcntd() +#define DTYPE "d" +#define WIDTH "d" +#define SHIFT "3" #else -#define SVE_TYPE svfloat32_t -#define SVE_ZERO svdup_f32(0.0) -#define SVE_WHILELT svwhilelt_b32 -#define SVE_ALL svptrue_b32() -#define SVE_WIDTH svcntw() +#define DTYPE "s" +#define WIDTH "w" +#define SHIFT "2" #endif -static FLOAT dot_kernel_sve(BLASLONG n, FLOAT *x, FLOAT *y) { - SVE_TYPE acc_a = SVE_ZERO; - SVE_TYPE acc_b = SVE_ZERO; +#define COUNT \ +" cnt"WIDTH" x9 \n" +#define SETUP_TRUE \ +" ptrue p0."DTYPE" \n" +#define OFFSET_INPUTS \ +" add x12, %[X_], x9, lsl #"SHIFT" \n" \ +" add x13, %[Y_], x9, lsl #"SHIFT" \n" +#define TAIL_WHILE \ +" whilelo p1."DTYPE", x8, x0 \n" +#define UPDATE(pg, x,y,out) \ +" ld1"WIDTH" { z2."DTYPE" }, "pg"/z, ["x", x8, lsl #"SHIFT"] \n" \ +" ld1"WIDTH" { z3."DTYPE" }, "pg"/z, ["y", x8, lsl #"SHIFT"] \n" \ +" fmla "out"."DTYPE", "pg"/m, z2."DTYPE", z3."DTYPE" \n" +#define SUM_VECTOR(v) \ +" faddv "DTYPE""v", p0, z"v"."DTYPE" \n" +#define RET \ +" fadd %"DTYPE"[RET_], "DTYPE"1, "DTYPE"0 \n" - BLASLONG sve_width = SVE_WIDTH; +#define DOT_KERNEL \ + COUNT \ +" mov z1.d, #0 \n" \ +" mov z0.d, #0 \n" \ +" mov x8, #0 \n" \ +" movi d1, #0x0 \n" \ + SETUP_TRUE \ +" neg x10, x9, lsl #1 \n" \ +" ands x11, x10, x0 \n" \ +" b.eq 2f // skip_2x \n" \ + OFFSET_INPUTS \ +"1: // vector_2x \n" \ + UPDATE("p0", "%[X_]", "%[Y_]", "z1") \ + UPDATE("p0", "x12", "x13", "z0") \ +" sub x8, x8, x10 \n" \ +" cmp x8, x11 \n" \ +" b.lo 1b // vector_2x \n" \ + SUM_VECTOR("1") \ +"2: // skip_2x \n" \ +" neg x10, x9 \n" \ +" and x10, x10, x0 \n" \ +" cmp x8, x10 \n" \ +" b.hs 4f // tail \n" \ +"3: // vector_1x \n" \ + UPDATE("p0", "%[X_]", "%[Y_]", "z0") \ +" add x8, x8, x9 \n" \ +" cmp x8, x10 \n" \ +" b.lo 3b // vector_1x \n" \ +"4: // tail \n" \ +" cmp x10, x0 \n" \ +" b.eq 5f // end \n" \ + TAIL_WHILE \ + UPDATE("p1", "%[X_]", "%[Y_]", "z0") \ +"5: // end \n" \ + SUM_VECTOR("0") \ + RET - for (BLASLONG i = 0; i < n; i += sve_width * 2) { - svbool_t pg_a = SVE_WHILELT((uint64_t)i, (uint64_t)n); - svbool_t pg_b = SVE_WHILELT((uint64_t)(i + sve_width), (uint64_t)n); +static +FLOAT +dot_kernel_sve(BLASLONG n, FLOAT* x, FLOAT* y) +{ + FLOAT ret; - SVE_TYPE x_vec_a = svld1(pg_a, &x[i]); - SVE_TYPE y_vec_a = svld1(pg_a, &y[i]); - SVE_TYPE x_vec_b = svld1(pg_b, &x[i + sve_width]); - SVE_TYPE y_vec_b = svld1(pg_b, &y[i + sve_width]); + asm(DOT_KERNEL + : + [RET_] "=&w" (ret) + : + [N_] "r" (n), + [X_] "r" (x), + [Y_] "r" (y) + :); - acc_a = svmla_m(pg_a, acc_a, x_vec_a, y_vec_a); - acc_b = svmla_m(pg_b, acc_b, x_vec_b, y_vec_b); - } - - return svaddv(SVE_ALL, acc_a) + svaddv(SVE_ALL, acc_b); + return ret; } diff --git a/kernel/arm64/zscal.S b/kernel/arm64/zscal.S index 929455975..4bd43320d 100644 --- a/kernel/arm64/zscal.S +++ b/kernel/arm64/zscal.S @@ -223,7 +223,7 @@ zscal_begin: fcmp DA_I, #0.0 beq .Lzscal_kernel_RI_zero - b .Lzscal_kernel_R_zero +// b .Lzscal_kernel_R_zero .Lzscal_kernel_R_non_zero: diff --git a/kernel/csky/KERNEL b/kernel/csky/KERNEL new file mode 100644 index 000000000..afa8a0881 --- /dev/null +++ b/kernel/csky/KERNEL @@ -0,0 +1,149 @@ +SAMAXKERNEL = ../arm/amax.c +DAMAXKERNEL = ../arm/amax.c +CAMAXKERNEL = ../arm/zamax.c +ZAMAXKERNEL = ../arm/zamax.c + +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + 
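The rewritten dot_kernel_sve above replaces the ACLE intrinsics with hand-written SVE assembly: a two-vector main loop feeding two accumulators, a one-vector loop, and a predicated tail. For reference, the value it computes is the ordinary dot product; a scalar C sketch of the same semantics (illustrative only, double shown, not the tuned kernel):

    /* Scalar reference for dot_kernel_sve: sum over i of x[i]*y[i]. */
    static double dot_kernel_ref(long n, const double *x, const double *y)
    {
        double acc_a = 0.0, acc_b = 0.0;   /* two accumulators, mirroring the asm */
        long i = 0;
        for (; i + 1 < n; i += 2) {        /* unrolled-by-two main loop           */
            acc_a += x[i]     * y[i];
            acc_b += x[i + 1] * y[i + 1];
        }
        if (i < n)                         /* tail element                        */
            acc_a += x[i] * y[i];
        return acc_a + acc_b;
    }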
+SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMAXKERNEL = ../arm/iamax.c +IDAMAXKERNEL = ../arm/iamax.c +ICAMAXKERNEL = ../arm/izamax.c +IZAMAXKERNEL = ../arm/izamax.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +SASUMKERNEL = ../arm/asum.c +DASUMKERNEL = ../arm/asum.c +CASUMKERNEL = ../arm/zasum.c +ZASUMKERNEL = ../arm/zasum.c + +SSUMKERNEL = ../arm/sum.c +DSUMKERNEL = ../arm/sum.c +CSUMKERNEL = ../arm/zsum.c +ZSUMKERNEL = ../arm/zsum.c + +SAXPYKERNEL = ../arm/axpy.c +DAXPYKERNEL = ../arm/axpy.c +CAXPYKERNEL = ../arm/zaxpy.c +ZAXPYKERNEL = ../arm/zaxpy.c + +SCOPYKERNEL = ../arm/copy.c +DCOPYKERNEL = ../arm/copy.c +CCOPYKERNEL = ../arm/zcopy.c +ZCOPYKERNEL = ../arm/zcopy.c + +SDOTKERNEL = ../arm/dot.c +DDOTKERNEL = ../arm/dot.c +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c +DSDOTKERNEL = ../generic/dot.c + +SNRM2KERNEL = ../arm/nrm2.c +DNRM2KERNEL = ../arm/nrm2.c +CNRM2KERNEL = ../arm/znrm2.c +ZNRM2KERNEL = ../arm/znrm2.c + +SROTKERNEL = ../arm/rot.c +DROTKERNEL = ../arm/rot.c +CROTKERNEL = ../arm/zrot.c +ZROTKERNEL = ../arm/zrot.c + +SSCALKERNEL = ../arm/scal.c +DSCALKERNEL = ../arm/scal.c +CSCALKERNEL = ../arm/zscal.c +ZSCALKERNEL = ../arm/zscal.c + +SSWAPKERNEL = ../arm/swap.c +DSWAPKERNEL = ../arm/swap.c +CSWAPKERNEL = ../arm/zswap.c +ZSWAPKERNEL = ../arm/zswap.c + +SGEMVNKERNEL = ../arm/gemv_n.c +DGEMVNKERNEL = ../arm/gemv_n.c +CGEMVNKERNEL = ../arm/zgemv_n.c +ZGEMVNKERNEL = ../arm/zgemv_n.c + +SGEMVTKERNEL = ../arm/gemv_t.c +DGEMVTKERNEL = ../arm/gemv_t.c +CGEMVTKERNEL = ../arm/zgemv_t.c +ZGEMVTKERNEL = ../arm/zgemv_t.c + +STRMMKERNEL = ../generic/trmmkernel_2x2.c +DTRMMKERNEL = ../generic/trmmkernel_2x2.c +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +SGEMMKERNEL = ../generic/gemmkernel_2x2.c +SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DGEMMKERNEL = ../generic/gemmkernel_2x2.c +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = 
../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + + +SCABS_KERNEL = ../generic/cabs.c +DCABS_KERNEL = ../generic/cabs.c +QCABS_KERNEL = ../generic/cabs.c +LSAME_KERNEL = ../generic/lsame.c + +SGEMM_BETA = ../generic/gemm_beta.c +DGEMM_BETA = ../generic/gemm_beta.c +CGEMM_BETA = ../generic/zgemm_beta.c +ZGEMM_BETA = ../generic/zgemm_beta.c + + diff --git a/kernel/csky/Makefile b/kernel/csky/Makefile new file mode 100644 index 000000000..520349bd6 --- /dev/null +++ b/kernel/csky/Makefile @@ -0,0 +1 @@ +clean :: diff --git a/kernel/generic/trmmkernel_16x8.c b/kernel/generic/trmmkernel_16x8.c new file mode 100644 index 000000000..5412eab70 --- /dev/null +++ b/kernel/generic/trmmkernel_16x8.c @@ -0,0 +1,3676 @@ +#include "common.h" + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset) +{ + BLASLONG i,j,k; + FLOAT *C0,*C1,*C2,*C3,*C4,*C5,*C6,*C7,*ptrba,*ptrbb; + + FLOAT res0_0; + FLOAT res0_1; + FLOAT res0_2; + FLOAT res0_3; + FLOAT res0_4; + FLOAT res0_5; + FLOAT res0_6; + FLOAT res0_7; + + FLOAT res0_8; + FLOAT res0_9; + FLOAT res0_10; + FLOAT res0_11; + FLOAT res0_12; + FLOAT res0_13; + FLOAT res0_14; + FLOAT res0_15; + + FLOAT res1_0; + FLOAT res1_1; + FLOAT res1_2; + FLOAT res1_3; + FLOAT res1_4; + FLOAT res1_5; + FLOAT res1_6; + FLOAT res1_7; + + FLOAT res1_8; + FLOAT res1_9; + FLOAT res1_10; + FLOAT res1_11; + FLOAT res1_12; + FLOAT res1_13; + FLOAT res1_14; + FLOAT res1_15; + + FLOAT res2_0; + FLOAT res2_1; + FLOAT res2_2; + FLOAT res2_3; + FLOAT res2_4; + FLOAT res2_5; + FLOAT res2_6; + FLOAT res2_7; + + FLOAT res2_8; + FLOAT res2_9; + FLOAT res2_10; + FLOAT res2_11; + FLOAT res2_12; + FLOAT res2_13; + FLOAT res2_14; + FLOAT res2_15; + + FLOAT res3_0; + FLOAT res3_1; + FLOAT res3_2; + FLOAT res3_3; + FLOAT res3_4; + FLOAT res3_5; + FLOAT res3_6; + FLOAT res3_7; + + FLOAT res3_8; + FLOAT res3_9; + FLOAT res3_10; + FLOAT res3_11; + FLOAT res3_12; + FLOAT res3_13; + FLOAT res3_14; + FLOAT res3_15; + + FLOAT res4_0; + FLOAT res4_1; + FLOAT res4_2; + FLOAT res4_3; + FLOAT res4_4; + FLOAT res4_5; + FLOAT res4_6; + FLOAT res4_7; + + FLOAT res4_8; + FLOAT res4_9; + FLOAT res4_10; + FLOAT res4_11; + FLOAT res4_12; + FLOAT res4_13; + FLOAT res4_14; + FLOAT res4_15; + + FLOAT res5_0; + FLOAT res5_1; + FLOAT res5_2; + FLOAT res5_3; + FLOAT res5_4; + FLOAT res5_5; + FLOAT res5_6; + FLOAT res5_7; + + FLOAT res5_8; + FLOAT res5_9; + FLOAT res5_10; + FLOAT res5_11; + FLOAT res5_12; + FLOAT res5_13; + FLOAT res5_14; + FLOAT res5_15; + + FLOAT res6_0; + FLOAT res6_1; + FLOAT res6_2; + FLOAT res6_3; + FLOAT res6_4; + FLOAT res6_5; + FLOAT res6_6; + FLOAT res6_7; + + FLOAT res6_8; + FLOAT res6_9; + FLOAT res6_10; + FLOAT res6_11; + FLOAT res6_12; + FLOAT res6_13; + FLOAT res6_14; + FLOAT res6_15; + + FLOAT res7_0; + FLOAT res7_1; + FLOAT res7_2; + FLOAT res7_3; + FLOAT res7_4; + FLOAT res7_5; + FLOAT res7_6; + FLOAT res7_7; + + FLOAT res7_8; + FLOAT res7_9; + FLOAT res7_10; + FLOAT res7_11; + FLOAT res7_12; + FLOAT res7_13; + FLOAT res7_14; + FLOAT res7_15; + + FLOAT a0; + FLOAT a1; + + FLOAT b0; + FLOAT b1; + FLOAT b2; + FLOAT b3; + FLOAT b4; + FLOAT b5; + FLOAT b6; + FLOAT b7; + + BLASLONG off, temp; + +#if !defined(LEFT) + off = -offset; +#else + off = 0; +#endif + + for (j=0; j +#include "common.h" + +#define a2 (a1 + 2) +#define a4 (a3 + 2) +#define a6 (a5 + 2) +#define a8 (a7 + 2) + +int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *ipiv, FLOAT *buffer){ + + BLASLONG i, j, ip1, 
ip2; + blasint *piv; + FLOAT *a1, *a3, *a5, *a7; + FLOAT *b1, *b2, *b3, *b4; + FLOAT *b5, *b6, *b7, *b8; + FLOAT A1, A2, A3, A4, A5, A6, A7, A8; + FLOAT B1, B2, B3, B4, B5, B6, B7, B8; + + FLOAT A9, A10, A11, A12, A13, A14, A15, A16; + FLOAT B9, B10, B11, B12, B13, B14, B15, B16; + + a -= 2; + lda *= 2; + k1 --; + + ipiv += k1; + + if (n <= 0) return 0; + + j = (n >> 3); + if (j > 0) { + do { + piv = ipiv; + + a1 = a + (k1 + 1) * 2; + + a3 = a1 + 1 * lda; + a5 = a1 + 2 * lda; + a7 = a1 + 3 * lda; + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + b5 = b1 + 2 * lda; + b6 = b2 + 2 * lda; + b7 = b1 + 3 * lda; + b8 = b2 + 3 * lda; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + for( int pass = 0; pass < 2; ++pass ) { + A1 = *(a1 + 0); + A9 = *(a1 + 1); + A2 = *(a2 + 0); + A10 = *(a2 + 1); + A3 = *(a3 + 0); + A11 = *(a3 + 1); + A4 = *(a4 + 0); + A12 = *(a4 + 1); + A5 = *(a5 + 0); + A13 = *(a5 + 1); + A6 = *(a6 + 0); + A14 = *(a6 + 1); + A7 = *(a7 + 0); + A15 = *(a7 + 1); + A8 = *(a8 + 0); + A16 = *(a8 + 1); + + B1 = *(b1 + 0); + B9 = *(b1 + 1); + B2 = *(b2 + 0); + B10 = *(b2 + 1); + B3 = *(b3 + 0); + B11 = *(b3 + 1); + B4 = *(b4 + 0); + B12 = *(b4 + 1); + B5 = *(b5 + 0); + B13 = *(b5 + 1); + B6 = *(b6 + 0); + B14 = *(b6 + 1); + B7 = *(b7 + 0); + B15 = *(b7 + 1); + B8 = *(b8 + 0); + B16 = *(b8 + 1); + + if (b1 == a1) { + if (b2 == a2) { + *(buffer + 0) = A1; + *(buffer + 1) = A9; + *(buffer + 2) = A3; + *(buffer + 3) = A11; + *(buffer + 4) = A5; + *(buffer + 5) = A13; + *(buffer + 6) = A7; + *(buffer + 7) = A15; + + *(buffer + 8) = A2; + *(buffer + 9) = A10; + *(buffer + 10) = A4; + *(buffer + 11) = A12; + *(buffer + 12) = A6; + *(buffer + 13) = A14; + *(buffer + 14) = A8; + *(buffer + 15) = A16; + } else { + *(buffer + 0) = A1; + *(buffer + 1) = A9; + *(buffer + 2) = A3; + *(buffer + 3) = A11; + *(buffer + 4) = A5; + *(buffer + 5) = A13; + *(buffer + 6) = A7; + *(buffer + 7) = A15; + + *(buffer + 8) = B2; + *(buffer + 9) = B10; + *(buffer + 10) = B4; + *(buffer + 11) = B12; + *(buffer + 12) = B6; + *(buffer + 13) = B14; + *(buffer + 14) = B8; + *(buffer + 15) = B16; + + *(b2 + 0) = A2; + *(b2 + 1) = A10; + *(b4 + 0) = A4; + *(b4 + 1) = A12; + *(b6 + 0) = A6; + *(b6 + 1) = A14; + *(b8 + 0) = A8; + *(b8 + 1) = A16; + } + } else + if (b1 == a2) { + if (b2 == a2) { + *(buffer + 0) = A2; + *(buffer + 1) = A10; + *(buffer + 2) = A4; + *(buffer + 3) = A12; + *(buffer + 4) = A6; + *(buffer + 5) = A14; + *(buffer + 6) = A8; + *(buffer + 7) = A16; + *(buffer + 8) = A1; + *(buffer + 9) = A9; + *(buffer + 10) = A3; + *(buffer + 11) = A11; + *(buffer + 12) = A5; + *(buffer + 13) = A13; + *(buffer + 14) = A7; + *(buffer + 15) = A15; + + } else { + *(buffer + 0) = A2; + *(buffer + 1) = A10; + *(buffer + 2) = A4; + *(buffer + 3) = A12; + *(buffer + 4) = A6; + *(buffer + 5) = A14; + *(buffer + 6) = A8; + *(buffer + 7) = A16; + *(buffer + 8) = B2; + *(buffer + 9) = B10; + *(buffer + 10) = B4; + *(buffer + 11) = B12; + *(buffer + 12) = B6; + *(buffer + 13) = B14; + *(buffer + 14) = B8; + *(buffer + 15) = B16; + + *(b2 + 0) = A1; + *(b2 + 1) = A9; + *(b4 + 0) = A3; + *(b4 + 1) = A11; + *(b6 + 0) = A5; + *(b6 + 1) = A13; + *(b8 + 0) = A7; + *(b8 + 1) = A15; + } + } else { + if (b2 == a2) { + *(buffer + 0) = B1; + *(buffer + 1) = B9; + *(buffer + 2) = B3; + *(buffer + 3) = B11; + *(buffer + 4) = B5; + *(buffer + 5) = B13; + *(buffer + 6) = B7; + *(buffer + 7) = 
B15; + *(buffer + 8) = A2; + *(buffer + 9) = A10; + *(buffer + 10) = A4; + *(buffer + 11) = A12; + *(buffer + 12) = A6; + *(buffer + 13) = A14; + *(buffer + 14) = A8; + *(buffer + 15) = A16; + + *(b1 + 0) = A1; + *(b1 + 1) = A9; + *(b3 + 0) = A3; + *(b3 + 1) = A11; + *(b5 + 0) = A5; + *(b5 + 1) = A13; + *(b7 + 0) = A7; + *(b7 + 1) = A15; + } else + if (b2 == b1) { + *(buffer + 0) = B1; + *(buffer + 1) = B9; + *(buffer + 2) = B3; + *(buffer + 3) = B11; + *(buffer + 4) = B5; + *(buffer + 5) = B13; + *(buffer + 6) = B7; + *(buffer + 7) = B15; + *(buffer + 8) = A1; + *(buffer + 9) = A9; + *(buffer + 10) = A3; + *(buffer + 11) = A11; + *(buffer + 12) = A5; + *(buffer + 13) = A13; + *(buffer + 14) = A7; + *(buffer + 15) = A15; + + *(b1 + 0) = A2; + *(b1 + 1) = A10; + *(b3 + 0) = A4; + *(b3 + 1) = A12; + *(b5 + 0) = A6; + *(b5 + 1) = A14; + *(b7 + 0) = A8; + *(b7 + 1) = A16; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B9; + *(buffer + 2) = B3; + *(buffer + 3) = B11; + *(buffer + 4) = B5; + *(buffer + 5) = B13; + *(buffer + 6) = B7; + *(buffer + 7) = B15; + *(buffer + 8) = B2; + *(buffer + 9) = B10; + *(buffer + 10) = B4; + *(buffer + 11) = B12; + *(buffer + 12) = B6; + *(buffer + 13) = B14; + *(buffer + 14) = B8; + *(buffer + 15) = B16; + + *(b1 + 0) = A1; + *(b1 + 1) = A9; + *(b2 + 0) = A2; + *(b2 + 1) = A10; + *(b3 + 0) = A3; + *(b3 + 1) = A11; + *(b4 + 0) = A4; + *(b4 + 1) = A12; + *(b5 + 0) = A5; + *(b5 + 1) = A13; + *(b6 + 0) = A6; + *(b6 + 1) = A14; + *(b7 + 0) = A7; + *(b7 + 1) = A15; + *(b8 + 0) = A8; + *(b8 + 1) = A16; + } + } + b1 += 4*lda; + b2 += 4*lda; + b3 += 4*lda; + b4 += 4*lda; + b5 += 4*lda; + b6 += 4*lda; + b7 += 4*lda; + b8 += 4*lda; + + a1 += 4; + a3 += 4; + a5 += 4; + a7 += 4; + + buffer += 16; + } + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + b5 = b1 + 2 * lda; + b6 = b2 + 2 * lda; + b7 = b1 + 3 * lda; + b8 = b2 + 3 * lda; + + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *(a1 + 0); + A9 = *(a1 + 1); + B1 = *(b1 + 0); + B9 = *(b1 + 1); + A3 = *(a3 + 0); + A11 = *(a3 + 1); + B3 = *(b3 + 0); + B11 = *(b3 + 1); + A5 = *(a5 + 0); + A13 = *(a5 + 1); + B5 = *(b5 + 0); + B13 = *(b5 + 1); + A7 = *(a7 + 0); + A15 = *(a7 + 1); + B7 = *(b7 + 0); + B15 = *(b7 + 1); + + if (a1 == b1) { + *(buffer + 0) = A1; + *(buffer + 1) = A9; + *(buffer + 2) = A3; + *(buffer + 3) = A11; + *(buffer + 4) = A5; + *(buffer + 5) = A13; + *(buffer + 6) = A7; + *(buffer + 7) = A15; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B9; + *(buffer + 2) = B3; + *(buffer + 3) = B11; + *(buffer + 4) = B5; + *(buffer + 5) = B13; + *(buffer + 6) = B7; + *(buffer + 7) = B15; + + *(b1 + 0) = A1; + *(b1 + 1) = A9; + *(b3 + 0) = A3; + *(b3 + 1) = A11; + *(b5 + 0) = A5; + *(b5 + 1) = A13; + *(b7 + 0) = A7; + *(b7 + 1) = A15; + } + buffer += 8; + } + + a += 4 * lda; + + j --; + } while (j > 0); + } + + + if (n & 4) { + { + piv = ipiv; + + a1 = a + (k1 + 1) * 2; + + a3 = a1 + 1 * lda; + a5 = a1 + 2 * lda; + a7 = a1 + 3 * lda; + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + b5 = b1 + 2 * lda; + b6 = b2 + 2 * lda; + b7 = b1 + 3 * lda; + b8 = b2 + 3 * lda; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + A1 = *(a1 + 0); + A9 = *(a1 + 1); + A2 = *(a2 + 0); + A10 = *(a2 + 1); + A3 = *(a3 + 0); + A11 = *(a3 + 1); + A4 = *(a4 + 0); + A12 = *(a4 + 1); + A5 = *(a5 + 0); + A13 = *(a5 + 1); + A6 = *(a6 + 0); + A14 = *(a6 + 1); + A7 = *(a7 + 0); + A15 = *(a7 + 1); 
+ A8 = *(a8 + 0); + A16 = *(a8 + 1); + + B1 = *(b1 + 0); + B9 = *(b1 + 1); + B2 = *(b2 + 0); + B10 = *(b2 + 1); + B3 = *(b3 + 0); + B11 = *(b3 + 1); + B4 = *(b4 + 0); + B12 = *(b4 + 1); + B5 = *(b5 + 0); + B13 = *(b5 + 1); + B6 = *(b6 + 0); + B14 = *(b6 + 1); + B7 = *(b7 + 0); + B15 = *(b7 + 1); + B8 = *(b8 + 0); + B16 = *(b8 + 1); + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + if (b1 == a1) { + if (b2 == a2) { + *(buffer + 0) = A1; + *(buffer + 1) = A9; + *(buffer + 2) = A3; + *(buffer + 3) = A11; + *(buffer + 4) = A5; + *(buffer + 5) = A13; + *(buffer + 6) = A7; + *(buffer + 7) = A15; + + *(buffer + 8) = A2; + *(buffer + 9) = A10; + *(buffer + 10) = A4; + *(buffer + 11) = A12; + *(buffer + 12) = A6; + *(buffer + 13) = A14; + *(buffer + 14) = A8; + *(buffer + 15) = A16; + } else { + *(buffer + 0) = A1; + *(buffer + 1) = A9; + *(buffer + 2) = A3; + *(buffer + 3) = A11; + *(buffer + 4) = A5; + *(buffer + 5) = A13; + *(buffer + 6) = A7; + *(buffer + 7) = A15; + + *(buffer + 8) = B2; + *(buffer + 9) = B10; + *(buffer + 10) = B4; + *(buffer + 11) = B12; + *(buffer + 12) = B6; + *(buffer + 13) = B14; + *(buffer + 14) = B8; + *(buffer + 15) = B16; + + *(b2 + 0) = A2; + *(b2 + 1) = A10; + *(b4 + 0) = A4; + *(b4 + 1) = A12; + *(b6 + 0) = A6; + *(b6 + 1) = A14; + *(b8 + 0) = A8; + *(b8 + 1) = A16; + } + } else + if (b1 == a2) { + if (b2 == a2) { + *(buffer + 0) = A2; + *(buffer + 1) = A10; + *(buffer + 2) = A4; + *(buffer + 3) = A12; + *(buffer + 4) = A6; + *(buffer + 5) = A14; + *(buffer + 6) = A8; + *(buffer + 7) = A16; + *(buffer + 8) = A1; + *(buffer + 9) = A9; + *(buffer + 10) = A3; + *(buffer + 11) = A11; + *(buffer + 12) = A5; + *(buffer + 13) = A13; + *(buffer + 14) = A7; + *(buffer + 15) = A15; + + } else { + *(buffer + 0) = A2; + *(buffer + 1) = A10; + *(buffer + 2) = A4; + *(buffer + 3) = A12; + *(buffer + 4) = A6; + *(buffer + 5) = A14; + *(buffer + 6) = A8; + *(buffer + 7) = A16; + *(buffer + 8) = B2; + *(buffer + 9) = B10; + *(buffer + 10) = B4; + *(buffer + 11) = B12; + *(buffer + 12) = B6; + *(buffer + 13) = B14; + *(buffer + 14) = B8; + *(buffer + 15) = B16; + + *(b2 + 0) = A1; + *(b2 + 1) = A9; + *(b4 + 0) = A3; + *(b4 + 1) = A11; + *(b6 + 0) = A5; + *(b6 + 1) = A13; + *(b8 + 0) = A7; + *(b8 + 1) = A15; + } + } else { + if (b2 == a2) { + *(buffer + 0) = B1; + *(buffer + 1) = B9; + *(buffer + 2) = B3; + *(buffer + 3) = B11; + *(buffer + 4) = B5; + *(buffer + 5) = B13; + *(buffer + 6) = B7; + *(buffer + 7) = B15; + *(buffer + 8) = A2; + *(buffer + 9) = A10; + *(buffer + 10) = A4; + *(buffer + 11) = A12; + *(buffer + 12) = A6; + *(buffer + 13) = A14; + *(buffer + 14) = A8; + *(buffer + 15) = A16; + + *(b1 + 0) = A1; + *(b1 + 1) = A9; + *(b3 + 0) = A3; + *(b3 + 1) = A11; + *(b5 + 0) = A5; + *(b5 + 1) = A13; + *(b7 + 0) = A7; + *(b7 + 1) = A15; + } else + if (b2 == b1) { + *(buffer + 0) = B1; + *(buffer + 1) = B9; + *(buffer + 2) = B3; + *(buffer + 3) = B11; + *(buffer + 4) = B5; + *(buffer + 5) = B13; + *(buffer + 6) = B7; + *(buffer + 7) = B15; + *(buffer + 8) = A1; + *(buffer + 9) = A9; + *(buffer + 10) = A3; + *(buffer + 11) = A11; + *(buffer + 12) = A5; + *(buffer + 13) = A13; + *(buffer + 14) = A7; + *(buffer + 15) = A15; + + *(b1 + 0) = A2; + *(b1 + 1) = A10; + *(b3 + 0) = A4; + *(b3 + 1) = A12; + *(b5 + 0) = A6; + *(b5 + 1) = A14; + *(b7 + 0) = A8; + *(b7 + 1) = A16; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B9; + *(buffer + 2) = B3; + *(buffer + 3) = B11; + *(buffer + 4) = B5; + *(buffer + 5) = B13; + *(buffer + 6) = B7; + *(buffer + 7) = B15; + 
*(buffer + 8) = B2; + *(buffer + 9) = B10; + *(buffer + 10) = B4; + *(buffer + 11) = B12; + *(buffer + 12) = B6; + *(buffer + 13) = B14; + *(buffer + 14) = B8; + *(buffer + 15) = B16; + + *(b1 + 0) = A1; + *(b1 + 1) = A9; + *(b2 + 0) = A2; + *(b2 + 1) = A10; + *(b3 + 0) = A3; + *(b3 + 1) = A11; + *(b4 + 0) = A4; + *(b4 + 1) = A12; + *(b5 + 0) = A5; + *(b5 + 1) = A13; + *(b6 + 0) = A6; + *(b6 + 1) = A14; + *(b7 + 0) = A7; + *(b7 + 1) = A15; + *(b8 + 0) = A8; + *(b8 + 1) = A16; + } + } + + buffer += 16; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + b5 = b1 + 2 * lda; + b6 = b2 + 2 * lda; + b7 = b1 + 3 * lda; + b8 = b2 + 3 * lda; + + a1 += 4; + a3 += 4; + a5 += 4; + a7 += 4; + + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *(a1 + 0); + A9 = *(a1 + 1); + B1 = *(b1 + 0); + B9 = *(b1 + 1); + A3 = *(a3 + 0); + A11 = *(a3 + 1); + B3 = *(b3 + 0); + B11 = *(b3 + 1); + A5 = *(a5 + 0); + A13 = *(a5 + 1); + B5 = *(b5 + 0); + B13 = *(b5 + 1); + A7 = *(a7 + 0); + A15 = *(a7 + 1); + B7 = *(b7 + 0); + B15 = *(b7 + 1); + + if (a1 == b1) { + *(buffer + 0) = A1; + *(buffer + 1) = A9; + *(buffer + 2) = A3; + *(buffer + 3) = A11; + *(buffer + 4) = A5; + *(buffer + 5) = A13; + *(buffer + 6) = A7; + *(buffer + 7) = A15; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B9; + *(buffer + 2) = B3; + *(buffer + 3) = B11; + *(buffer + 4) = B5; + *(buffer + 5) = B13; + *(buffer + 6) = B7; + *(buffer + 7) = B15; + + *(b1 + 0) = A1; + *(b1 + 1) = A9; + *(b3 + 0) = A3; + *(b3 + 1) = A11; + *(b5 + 0) = A5; + *(b5 + 1) = A13; + *(b7 + 0) = A7; + *(b7 + 1) = A15; + } + buffer += 8; + } + + a += 4 * lda; + } + } //if (n & 4) + + if (n & 2) { + piv = ipiv; + + a1 = a + (k1 + 1) * 2; + a3 = a1 + lda; + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + lda; + b4 = b2 + lda; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a2 + 0); + A4 = *(a2 + 1); + A5 = *(a3 + 0); + A6 = *(a3 + 1); + A7 = *(a4 + 0); + A8 = *(a4 + 1); + + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b2 + 0); + B4 = *(b2 + 1); + B5 = *(b3 + 0); + B6 = *(b3 + 1); + B7 = *(b4 + 0); + B8 = *(b4 + 1); + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + if (b1 == a1) { + if (b2 == a2) { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + *(buffer + 2) = A5; + *(buffer + 3) = A6; + *(buffer + 4) = A3; + *(buffer + 5) = A4; + *(buffer + 6) = A7; + *(buffer + 7) = A8; + } else { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + *(buffer + 2) = A5; + *(buffer + 3) = A6; + *(buffer + 4) = B3; + *(buffer + 5) = B4; + *(buffer + 6) = B7; + *(buffer + 7) = B8; + + *(b2 + 0) = A3; + *(b2 + 1) = A4; + *(b4 + 0) = A7; + *(b4 + 1) = A8; + } + } else { + if (b1 == a2) { + if (b2 == a2) { + *(buffer + 0) = A3; + *(buffer + 1) = A4; + *(buffer + 2) = A7; + *(buffer + 3) = A8; + *(buffer + 4) = A1; + *(buffer + 5) = A2; + *(buffer + 6) = A5; + *(buffer + 7) = A6; + } else { + *(buffer + 0) = A3; + *(buffer + 1) = A4; + *(buffer + 2) = A7; + *(buffer + 3) = A8; + *(buffer + 4) = B3; + *(buffer + 5) = B4; + *(buffer + 6) = B7; + *(buffer + 7) = B8; + + *(b2 + 0) = A1; + *(b2 + 1) = A2; + *(b4 + 0) = A5; + *(b4 + 1) = A6; + } + } else { + if (b2 == a2) { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = B5; + *(buffer + 3) = B6; + *(buffer + 4) = A3; + *(buffer + 5) = A4; + *(buffer + 6) = A7; + *(buffer + 7) = A8; + + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b3 + 0) = A5; + *(b3 + 1) = A6; + } else { + if 
(b2 == b1) { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = B5; + *(buffer + 3) = B6; + *(buffer + 4) = A1; + *(buffer + 5) = A2; + *(buffer + 6) = A5; + *(buffer + 7) = A6; + + *(b1 + 0) = A3; + *(b1 + 1) = A4; + *(b3 + 0) = A7; + *(b3 + 1) = A8; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = B5; + *(buffer + 3) = B6; + *(buffer + 4) = B3; + *(buffer + 5) = B4; + *(buffer + 6) = B7; + *(buffer + 7) = B8; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + *(b3 + 0) = A5; + *(b3 + 1) = A6; + *(b4 + 0) = A7; + *(b4 + 1) = A8; + } + } + } + } + + buffer += 8; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + lda; + b4 = b2 + lda; + + a1 += 4; + a3 += 4; + + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + A3 = *(a3 + 0); + A4 = *(a3 + 1); + B3 = *(b3 + 0); + B4 = *(b3 + 1); + + if (a1 == b1) { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + *(buffer + 2) = A3; + *(buffer + 3) = A4; + + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = B3; + *(buffer + 3) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b3 + 0) = A3; + *(b3 + 1) = A4; + } + buffer += 4; + } + + a += 2 * lda; + } + + if (n & 1) { + piv = ipiv; + + a1 = a + (k1 + 1) * 2; + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + b1 = a + ip1; + b2 = a + ip2; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a2 + 0); + A4 = *(a2 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b2 + 0); + B4 = *(b2 + 1); + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + if (b1 == a1) { + if (b2 == a2) { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + *(buffer + 2) = A3; + *(buffer + 3) = A4; + } else { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + *(buffer + 2) = B3; + *(buffer + 3) = B4; + + *(b2 + 0) = A3; + *(b2 + 1) = A4; + } + } else + if (b1 == a2) { + if (b2 == a2) { + *(buffer + 0) = A3; + *(buffer + 1) = A4; + *(buffer + 2) = A1; + *(buffer + 3) = A2; + } else { + *(buffer + 0) = A3; + *(buffer + 1) = A4; + *(buffer + 2) = B3; + *(buffer + 3) = B4; + *(b2 + 0) = A1; + *(b2 + 1) = A2; + } + } else { + if (b2 == a2) { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = A3; + *(buffer + 3) = A4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } else + if (b2 == b1) { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = A1; + *(buffer + 3) = A2; + *(b1 + 0) = A3; + *(b1 + 1) = A4; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = B3; + *(buffer + 3) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + } + } + + buffer += 4; + + b1 = a + ip1; + b2 = a + ip2; + + a1 += 4; + + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + + if (a1 == b1) { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } + // buffer += 2; + } + } + + return 0; +} + diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index 1e4fa7a9d..c7ef44035 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -3,56 +3,126 @@ ifndef NO_LSX SDOTKERNEL = dot_lsx.S DSDOTKERNEL = dot_lsx.S DDOTKERNEL = dot_lsx.S +CDOTKERNEL = cdot_lsx.S +ZDOTKERNEL = cdot_lsx.S -SSCALKERNEL = sscal_lsx.S -DSCALKERNEL = dscal_lsx.S 
+SSCALKERNEL = scal_lsx.S +DSCALKERNEL = scal_lsx.S +CSCALKERNEL = cscal_lsx.S +ZSCALKERNEL = cscal_lsx.S -SAMAXKERNEL = samax_lsx.S -DAMAXKERNEL = damax_lsx.S +SAMAXKERNEL = amax_lsx.S +DAMAXKERNEL = amax_lsx.S +CAMAXKERNEL = camax_lsx.S +ZAMAXKERNEL = camax_lsx.S -SAMINKERNEL = samin_lsx.S -DAMINKERNEL = damin_lsx.S +SAMINKERNEL = amin_lsx.S +DAMINKERNEL = amin_lsx.S +CAMINKERNEL = camin_lsx.S +ZAMINKERNEL = camin_lsx.S -SMAXKERNEL = smax_lsx.S -DMAXKERNEL = dmax_lsx.S +SMAXKERNEL = max_lsx.S +DMAXKERNEL = max_lsx.S -SMINKERNEL = smin_lsx.S -DMINKERNEL = dmin_lsx.S +SMINKERNEL = min_lsx.S +DMINKERNEL = min_lsx.S -ISMAXKERNEL = ismax_lsx.S -IDMAXKERNEL = idmax_lsx.S +ISMAXKERNEL = imax_lsx.S +IDMAXKERNEL = imax_lsx.S -ISMINKERNEL = ismin_lsx.S -IDMINKERNEL = idmin_lsx.S +ISMINKERNEL = imin_lsx.S +IDMINKERNEL = imin_lsx.S -ISAMAXKERNEL = isamax_lsx.S -IDAMAXKERNEL = idamax_lsx.S +ISAMAXKERNEL = iamax_lsx.S +IDAMAXKERNEL = iamax_lsx.S +ICAMAXKERNEL = icamax_lsx.S +IZAMAXKERNEL = icamax_lsx.S -ISAMINKERNEL = isamin_lsx.S -IDAMINKERNEL = idamin_lsx.S +ISAMINKERNEL = iamin_lsx.S +IDAMINKERNEL = iamin_lsx.S +ICAMINKERNEL = icamin_lsx.S +IZAMINKERNEL = icamin_lsx.S -SCOPYKERNEL = scopy_lsx.S -DCOPYKERNEL = dcopy_lsx.S +SCOPYKERNEL = copy_lsx.S +DCOPYKERNEL = copy_lsx.S +CCOPYKERNEL = ccopy_lsx.S +ZCOPYKERNEL = ccopy_lsx.S -SSWAPKERNEL = sswap_lsx.S -DSWAPKERNEL = dswap_lsx.S +SSWAPKERNEL = swap_lsx.S +DSWAPKERNEL = swap_lsx.S -SAXPYKERNEL = saxpy_lsx.S -DAXPYKERNEL = daxpy_lsx.S +SAXPYKERNEL = axpy_lsx.S +DAXPYKERNEL = axpy_lsx.S +CAXPYKERNEL = caxpy_lsx.S +ZAXPYKERNEL = caxpy_lsx.S -SAXPBYKERNEL = saxpby_lsx.S -DAXPBYKERNEL = daxpby_lsx.S +SAXPBYKERNEL = axpby_lsx.S +DAXPBYKERNEL = axpby_lsx.S +CAXPBYKERNEL = caxpby_lsx.S +ZAXPBYKERNEL = caxpby_lsx.S -SSUMKERNEL = ssum_lsx.S -DSUMKERNEL = dsum_lsx.S +SSUMKERNEL = sum_lsx.S +DSUMKERNEL = sum_lsx.S -SASUMKERNEL = sasum_lsx.S -DASUMKERNEL = dasum_lsx.S +SASUMKERNEL = asum_lsx.S +DASUMKERNEL = asum_lsx.S +CASUMKERNEL = casum_lsx.S +ZASUMKERNEL = casum_lsx.S -SROTKERNEL = srot_lsx.S -DROTKERNEL = drot_lsx.S +SROTKERNEL = rot_lsx.S +DROTKERNEL = rot_lsx.S +CROTKERNEL = crot_lsx.S +ZROTKERNEL = crot_lsx.S SNRM2KERNEL = snrm2_lsx.S DNRM2KERNEL = dnrm2_lsx.S - +CNRM2KERNEL = cnrm2_lsx.S +ZNRM2KERNEL = znrm2_lsx.S + +CSWAPKERNEL = cswap_lsx.S +ZSWAPKERNEL = cswap_lsx.S + +CSUMKERNEL = csum_lsx.S +ZSUMKERNEL = csum_lsx.S + +DGEMMKERNEL = dgemm_kernel_8x4.S +DGEMMINCOPY = dgemm_ncopy_8_lsx.S +DGEMMITCOPY = dgemm_tcopy_8_lsx.S +DGEMMONCOPY = dgemm_ncopy_4_lsx.S +DGEMMOTCOPY = dgemm_tcopy_4_lsx.S +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CGEMMKERNEL = cgemm_kernel_8x4_lsx.S +CGEMMINCOPY = cgemm_ncopy_8_lsx.S +CGEMMITCOPY = cgemm_tcopy_8_lsx.S +CGEMMONCOPY = cgemm_ncopy_4_lsx.S +CGEMMOTCOPY = cgemm_tcopy_4_lsx.S +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZGEMMKERNEL = 
zgemm_kernel_4x4_lsx.S +ZGEMMONCOPY = zgemm_ncopy_4_lsx.S +ZGEMMOTCOPY = zgemm_tcopy_4_lsx.S +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c endif diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index f00abcb32..17d15656a 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -3,57 +3,87 @@ ifndef NO_LASX SDOTKERNEL = dot_lasx.S DSDOTKERNEL = dot_lasx.S DDOTKERNEL = dot_lasx.S +CDOTKERNEL = cdot_lasx.S +ZDOTKERNEL = cdot_lasx.S -SSCALKERNEL = sscal_lasx.S -DSCALKERNEL = dscal_lasx.S +SSCALKERNEL = scal_lasx.S +DSCALKERNEL = scal_lasx.S +CSCALKERNEL = cscal_lasx.S +ZSCALKERNEL = cscal_lasx.S -SAMAXKERNEL = samax_lasx.S -DAMAXKERNEL = damax_lasx.S +SAMAXKERNEL = amax_lasx.S +DAMAXKERNEL = amax_lasx.S +CAMAXKERNEL = camax_lasx.S +ZAMAXKERNEL = camax_lasx.S -SAMINKERNEL = samin_lasx.S -DAMINKERNEL = damin_lasx.S +SAMINKERNEL = amin_lasx.S +DAMINKERNEL = amin_lasx.S +CAMINKERNEL = camin_lasx.S +ZAMINKERNEL = camin_lasx.S -SMAXKERNEL = smax_lasx.S -DMAXKERNEL = dmax_lasx.S +SMAXKERNEL = max_lsx.S +DMAXKERNEL = max_lsx.S -SMINKERNEL = smin_lasx.S -DMINKERNEL = dmin_lasx.S +SMINKERNEL = min_lsx.S +DMINKERNEL = min_lsx.S -ISMAXKERNEL = ismax_lasx.S -IDMAXKERNEL = idmax_lasx.S +ISMAXKERNEL = imax_lasx.S +IDMAXKERNEL = imax_lasx.S -ISMINKERNEL = ismin_lasx.S -IDMINKERNEL = idmin_lasx.S +ISMINKERNEL = imin_lasx.S +IDMINKERNEL = imin_lasx.S -ISAMAXKERNEL = isamax_lasx.S -IDAMAXKERNEL = idamax_lasx.S +ISAMAXKERNEL = iamax_lasx.S +IDAMAXKERNEL = iamax_lasx.S +ICAMAXKERNEL = icamax_lasx.S +IZAMAXKERNEL = icamax_lasx.S -ISAMINKERNEL = isamin_lasx.S -IDAMINKERNEL = idamin_lasx.S +ISAMINKERNEL = iamin_lasx.S +IDAMINKERNEL = iamin_lasx.S +ICAMINKERNEL = icamin_lasx.S +IZAMINKERNEL = icamin_lasx.S -SCOPYKERNEL = scopy_lasx.S -DCOPYKERNEL = dcopy_lasx.S +SCOPYKERNEL = copy_lasx.S +DCOPYKERNEL = copy_lasx.S +CCOPYKERNEL = ccopy_lasx.S +ZCOPYKERNEL = ccopy_lasx.S -SSWAPKERNEL = sswap_lasx.S -DSWAPKERNEL = dswap_lasx.S +SSWAPKERNEL = swap_lasx.S +DSWAPKERNEL = swap_lasx.S -SAXPYKERNEL = saxpy_lasx.S -DAXPYKERNEL = daxpy_lasx.S +SAXPYKERNEL = axpy_lasx.S +DAXPYKERNEL = axpy_lasx.S +CAXPYKERNEL = caxpy_lasx.S +ZAXPYKERNEL = caxpy_lasx.S -SAXPBYKERNEL = saxpby_lasx.S -DAXPBYKERNEL = daxpby_lasx.S +SAXPBYKERNEL = axpby_lasx.S +DAXPBYKERNEL = axpby_lasx.S +CAXPBYKERNEL = caxpby_lasx.S +ZAXPBYKERNEL = caxpby_lasx.S -SSUMKERNEL = ssum_lasx.S -DSUMKERNEL = dsum_lasx.S +SSUMKERNEL = sum_lasx.S +DSUMKERNEL = sum_lasx.S -SASUMKERNEL = sasum_lasx.S -DASUMKERNEL = dasum_lasx.S +SASUMKERNEL = asum_lasx.S +DASUMKERNEL = asum_lasx.S +CASUMKERNEL = casum_lasx.S +ZASUMKERNEL = casum_lasx.S -SROTKERNEL = srot_lasx.S -DROTKERNEL = drot_lasx.S +SROTKERNEL = rot_lasx.S +DROTKERNEL = rot_lasx.S +CROTKERNEL = crot_lasx.S +ZROTKERNEL = crot_lasx.S SNRM2KERNEL = snrm2_lasx.S DNRM2KERNEL = dnrm2_lasx.S +CNRM2KERNEL = cnrm2_lasx.S +ZNRM2KERNEL = znrm2_lasx.S + +CSWAPKERNEL = cswap_lasx.S +ZSWAPKERNEL = cswap_lasx.S + +CSUMKERNEL = csum_lasx.S +ZSUMKERNEL = csum_lasx.S DGEMMKERNEL = dgemm_kernel_16x4.S DGEMMINCOPY = dgemm_ncopy_16.S @@ -81,13 +111,39 @@ SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) SGEMVNKERNEL = sgemv_n_8_lasx.S SGEMVTKERNEL = sgemv_t_8_lasx.S +CGEMMKERNEL = cgemm_kernel_2x2_lsx.S +CGEMMONCOPY = 
cgemm_ncopy_2_lsx.S +CGEMMOTCOPY = cgemm_tcopy_2_lsx.S +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZGEMMKERNEL = zgemm_kernel_8x4_lasx.S +ZGEMMINCOPY = zgemm_ncopy_8_lasx.S +ZGEMMITCOPY = zgemm_tcopy_8_lasx.S +ZGEMMONCOPY = zgemm_ncopy_4_lasx.S +ZGEMMOTCOPY = zgemm_tcopy_4_lasx.S +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + DTRSMKERNEL_LN = dtrsm_kernel_LN_16x4_lasx.S DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_lasx.S DTRSMKERNEL_RN = dtrsm_kernel_RN_16x4_lasx.S DTRSMKERNEL_RT = dtrsm_kernel_RT_16x4_lasx.S -endif STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif diff --git a/kernel/loongarch64/amax_lasx.S b/kernel/loongarch64/amax_lasx.S new file mode 100644 index 000000000..e964d4ddb --- /dev/null +++ b/kernel/loongarch64/amax_lasx.S @@ -0,0 +1,232 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
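For orientation alongside this new LASX kernel: the AMAX family (samax/damax) returns the largest absolute value among the N entries read with stride INCX, which the assembly below computes with 256-bit registers (FMAXA folds by magnitude, FABS is applied once at the end). A minimal scalar sketch in C, assuming the FLOAT/BLASLONG typedefs from OpenBLAS' common.h; the helper name and the simplified guard are ours, not part of the patch:

#include <math.h>
#include "common.h"

/* amax_ref is a hypothetical reference, not the OpenBLAS entry point:
 * it returns max(|x[0]|, |x[incx]|, ..., |x[(n-1)*incx]|), starting the
 * running maximum from zero just as the kernel zeroes VM0 with XVFSUB. */
static FLOAT amax_ref(BLASLONG n, const FLOAT *x, BLASLONG inc_x)
{
    FLOAT m = 0.0;
    if (n <= 0 || inc_x <= 0) return m;   /* simplified early-out */
    for (BLASLONG i = 0; i < n; i++) {
        FLOAT v = fabs(x[i * inc_x]);     /* fabs works for float or double FLOAT */
        if (v > m) m = v;
    }
    return m;
}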
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 + +#define I $r12 +#define TEMP $r13 + +#define VM0 $xr0 +#define VM1 $xr1 +#define VM2 $xr2 +#define VX0 $xr3 +#define VX1 $xr4 +#define VX2 $xr5 +#define VX3 $xr6 + +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r17 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT +#ifdef DOUBLE + xvldrepl.d VM0, X, 0 +#else + xvldrepl.w VM0, X, 0 +#endif + XVFSUB VM0, VM0, VM0 + bne INCX, TEMP, .L20 + + srai.d I, N, 4 + bge $r0, I, .L11 + .align 3 + +.L10: +#ifdef DOUBLE + xvld VX0, X, 0 + xvld VX1, X, 32 + xvld VX2, X, 64 + xvld VX3, X, 96 + addi.d I, I, -1 + addi.d X, X, 128 + XVFMAXA VM1, VX0, VX1 + XVFMAXA VM2, VX2, VX3 + XVFMAXA VM0, VM0, VM1 + XVFMAXA VM0, VM0, VM2 +#else + xvld VX0, X, 0 + xvld VX1, X, 32 + addi.d I, I, -1 + addi.d X, X, 64 + XVFMAXA VM1, VX0, VX1 + XVFMAXA VM0, VM0, VM1 +#endif + blt $r0, I, .L10 + +#ifdef DOUBLE + xvrepl128vei.d VX0, VM0, 0 + xvrepl128vei.d VX1, VM0, 1 + XVFMAXA VM0, VX0, VX1 +#else + xvrepl128vei.w VX0, VM0, 0 + xvrepl128vei.w VX1, VM0, 1 + xvrepl128vei.w VX2, VM0, 2 + xvrepl128vei.w VX3, VM0, 3 + XVFMAXA VM1, VX0, VX1 + XVFMAXA VM2, VX2, VX3 + XVFMAXA VM0, VM1, VM2 +#endif + xvpermi.q VM1, VM0, 0x1 + XVFMAXA VM0, VM0, VM1 + .align 3 + +.L11: + andi I, N, 0x0f + bge $r0, I, .L13 + .align 3 + +.L12: /* 0 < N < 16 */ + LD $f1, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + FMAXA $f0, $f0, $f1 + bnez I, .L12 + .align 3 + +.L13: + FABS $f0, $f0 + jirl $r0, $r1, 0x0 + .align 3 + +.L20: // INCX!=1 + srai.d I, N, 3 + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfmaxa.d VM1, VX0, VX1 + xvfmaxa.d VM0, VM0, VM1 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VM1, t1, 0 + xvinsgr2vr.w VM1, t2, 1 + xvinsgr2vr.w VM1, t3, 2 + xvinsgr2vr.w VM1, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VM1, t1, 4 + xvinsgr2vr.w VM1, t2, 5 + xvinsgr2vr.w VM1, t3, 6 + xvinsgr2vr.w VM1, t4, 7 + xvfmaxa.s VM0, VM0, VM1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + xvrepl128vei.d VX0, VM0, 0 + xvrepl128vei.d VX1, VM0, 1 + XVFMAXA VM0, VX0, VX1 +#else + xvrepl128vei.w VX0, VM0, 0 + xvrepl128vei.w VX1, VM0, 1 + xvrepl128vei.w VX2, VM0, 2 + xvrepl128vei.w VX3, VM0, 3 + XVFMAXA VM1, VX0, VX1 + XVFMAXA VM2, VX2, VX3 + XVFMAXA VM0, VM1, VM2 +#endif + xvpermi.q VM1, VM0, 1 + XVFMAXA VM0, VM0, VM1 + .align 3 + +.L23: //INCX!=1 and N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: /* 0 < N < 8 */ + LD $f1, X, 0 + addi.d I, I, -1 + add.d X, X, INCX + FMAXA $f0, $f0, $f1 + bnez I, .L24 + 
.align 3 + +.L999: + FABS $f0, $f0 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/amax_lsx.S b/kernel/loongarch64/amax_lsx.S new file mode 100644 index 000000000..fb3b77a0e --- /dev/null +++ b/kernel/loongarch64/amax_lsx.S @@ -0,0 +1,231 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
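One detail shared by the LASX and LSX variants: the INCX != 1 paths (.L21) do not rely on any hardware gather; elements are loaded one at a time with ld.d/ld.w and placed into vector lanes with xvinsgr2vr/vinsgr2vr before a single vector FMAXA. In C terms the per-iteration step looks roughly like the sketch below (four lanes, as in the LASX double path; the helper name is illustrative only):

#include <math.h>
#include "common.h"

/* Gather four stride-separated values into a contiguous block, as the
 * lane-insert instructions do, then reduce that block by largest
 * absolute value. Illustrative sketch only. */
static FLOAT amax_gather4(const FLOAT *x, BLASLONG inc_x)
{
    FLOAT lane[4];
    FLOAT m;
    for (int i = 0; i < 4; i++)
        lane[i] = x[(BLASLONG)i * inc_x];  /* ld.d/ld.w + [x]vinsgr2vr */
    m = fabs(lane[0]);
    for (int i = 1; i < 4; i++)
        if (fabs(lane[i]) > m)
            m = fabs(lane[i]);             /* vector FMAXA, then lane fold */
    return m;
}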
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 + +#define I $r12 +#define TEMP $r13 + +#define VM0 $vr0 +#define VM1 $vr1 +#define VM2 $vr2 +#define VX0 $vr3 +#define VX1 $vr4 +#define VX2 $vr5 +#define VX3 $vr6 + +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r17 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT +#ifdef DOUBLE + vldrepl.d VM0, X, 0 +#else + vldrepl.w VM0, X, 0 +#endif + VFSUB VM0, VM0, VM0 + bne INCX, TEMP, .L20 + + srai.d I, N, 3 + bge $r0, I, .L11 + .align 3 + +.L10: +#ifdef DOUBLE + vld VX0, X, 0 + vld VX1, X, 16 + vld VX2, X, 32 + vld VX3, X, 48 + addi.d I, I, -1 + addi.d X, X, 64 + VFMAXA VM1, VX0, VX1 + VFMAXA VM2, VX2, VX3 + VFMAXA VM0, VM0, VM1 + VFMAXA VM0, VM0, VM2 +#else + vld VX0, X, 0 + vld VX1, X, 16 + addi.d I, I, -1 + addi.d X, X, 32 + VFMAXA VM1, VX0, VX1 + VFMAXA VM0, VM0, VM1 +#endif + blt $r0, I, .L10 + +#ifdef DOUBLE + vreplvei.d VX0, VM0, 0 + vreplvei.d VX1, VM0, 1 + VFMAXA VM0, VX0, VX1 +#else + vreplvei.w VX0, VM0, 0 + vreplvei.w VX1, VM0, 1 + vreplvei.w VX2, VM0, 2 + vreplvei.w VX3, VM0, 3 + VFMAXA VM1, VX0, VX1 + VFMAXA VM2, VX2, VX3 + VFMAXA VM0, VM1, VM2 +#endif + .align 3 + +.L11: + andi I, N, 7 + bge $r0, I, .L13 + .align 3 + +.L12: + LD $f1, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + FMAXA $f0, $f0, $f1 + bnez I, .L12 + .align 3 + +.L13: + FABS $f0, $f0 + jirl $r0, $r1, 0x0 + .align 3 + +.L20: // INCX!=1 + srai.d I, N, 3 + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmaxa.d VM1, VX0, VX1 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmaxa.d VM2, VX0, VX1 + vfmaxa.d VM1, VM1, VM2 + vfmaxa.d VM0, VM0, VM1 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vfmaxa.s VM1, VX0, VX1 + vfmaxa.s VM0, VM0, VM1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + vreplvei.d VX0, VM0, 0 + vreplvei.d VX1, VM0, 1 + VFMAXA VM0, VX0, VX1 +#else + vreplvei.w VX0, VM0, 0 + vreplvei.w VX1, VM0, 1 + vreplvei.w VX2, VM0, 2 + vreplvei.w VX3, VM0, 3 + VFMAXA VM1, VX0, VX1 + VFMAXA VM2, VX2, VX3 + VFMAXA VM0, VM1, VM2 +#endif + .align 3 + +.L23: //INCX!=1 and N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD $f1, X, 0 + addi.d I, I, -1 + add.d X, X, INCX + FMAXA $f0, $f0, $f1 + bnez I, .L24 + .align 3 + +.L999: + FABS $f0, $f0 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/amin_lasx.S 
b/kernel/loongarch64/amin_lasx.S new file mode 100644 index 000000000..c91a33006 --- /dev/null +++ b/kernel/loongarch64/amin_lasx.S @@ -0,0 +1,231 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 + +#define I $r12 +#define TEMP $r13 + +#define VM0 $xr0 +#define VM1 $xr1 +#define VM2 $xr2 +#define VX0 $xr3 +#define VX1 $xr4 +#define VX2 $xr5 +#define VX3 $xr6 + +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r17 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT +#ifdef DOUBLE + xvldrepl.d VM0, X, 0 +#else + xvldrepl.w VM0, X, 0 +#endif + bne INCX, TEMP, .L20 + + srai.d I, N, 4 + bge $r0, I, .L11 + .align 3 + +.L10: +#ifdef DOUBLE + xvld VX0, X, 0 + xvld VX1, X, 32 + xvld VX2, X, 64 + xvld VX3, X, 96 + addi.d I, I, -1 + addi.d X, X, 128 + XVFMINA VM1, VX0, VX1 + XVFMINA VM2, VX2, VX3 + XVFMINA VM0, VM0, VM1 + XVFMINA VM0, VM0, VM2 +#else + xvld VX0, X, 0 + xvld VX1, X, 32 + addi.d I, I, -1 + addi.d X, X, 64 + XVFMINA VM1, VX0, VX1 + XVFMINA VM0, VM0, VM1 +#endif + blt $r0, I, .L10 + +#ifdef DOUBLE + xvrepl128vei.d VX0, VM0, 0 + xvrepl128vei.d VX1, VM0, 1 + XVFMINA VM0, VX0, VX1 +#else + xvrepl128vei.w VX0, VM0, 0 + xvrepl128vei.w VX1, VM0, 1 + xvrepl128vei.w VX2, VM0, 2 + xvrepl128vei.w VX3, VM0, 3 + XVFMINA VM1, VX0, VX1 + XVFMINA VM2, VX2, VX3 + XVFMINA VM0, VM1, VM2 +#endif + xvpermi.q VM1, VM0, 0x1 + XVFMINA VM0, VM0, VM1 + .align 3 + +.L11: + andi I, N, 0x0f + bge $r0, I, .L13 + .align 3 + +.L12: /* 0 < N < 16 */ + LD $f1, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + FMINA $f0, $f0, $f1 + bnez I, .L12 + .align 3 + +.L13: + FABS $f0, $f0 + jirl $r0, $r1, 0x0 + .align 3 + +.L20: // INCX!=1 + srai.d I, N, 3 + bge $r0, I, .L23 + .align 3 + 
+.L21: +#ifdef DOUBLE + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfmina.d VM1, VX0, VX1 + xvfmina.d VM0, VM0, VM1 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VM1, t1, 0 + xvinsgr2vr.w VM1, t2, 1 + xvinsgr2vr.w VM1, t3, 2 + xvinsgr2vr.w VM1, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VM1, t1, 4 + xvinsgr2vr.w VM1, t2, 5 + xvinsgr2vr.w VM1, t3, 6 + xvinsgr2vr.w VM1, t4, 7 + xvfmina.s VM0, VM0, VM1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + xvrepl128vei.d VX0, VM0, 0 + xvrepl128vei.d VX1, VM0, 1 + XVFMINA VM0, VX0, VX1 +#else + xvrepl128vei.w VX0, VM0, 0 + xvrepl128vei.w VX1, VM0, 1 + xvrepl128vei.w VX2, VM0, 2 + xvrepl128vei.w VX3, VM0, 3 + XVFMINA VM1, VX0, VX1 + XVFMINA VM2, VX2, VX3 + XVFMINA VM0, VM1, VM2 +#endif + xvpermi.q VM1, VM0, 1 + XVFMINA VM0, VM0, VM1 + .align 3 + +.L23: //INCX!=1 and N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: /* 0 < N < 8 */ + LD $f1, X, 0 + addi.d I, I, -1 + add.d X, X, INCX + FMINA $f0, $f0, $f1 + bnez I, .L24 + .align 3 + +.L999: + FABS $f0, $f0 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/amin_lsx.S b/kernel/loongarch64/amin_lsx.S new file mode 100644 index 000000000..47701b6e4 --- /dev/null +++ b/kernel/loongarch64/amin_lsx.S @@ -0,0 +1,230 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 + +#define I $r12 +#define TEMP $r13 + +#define VM0 $vr0 +#define VM1 $vr1 +#define VM2 $vr2 +#define VX0 $vr3 +#define VX1 $vr4 +#define VX2 $vr5 +#define VX3 $vr6 + +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r17 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT +#ifdef DOUBLE + vldrepl.d VM0, X, 0 +#else + vldrepl.w VM0, X, 0 +#endif + bne INCX, TEMP, .L20 + + srai.d I, N, 3 + bge $r0, I, .L11 + .align 3 + +.L10: +#ifdef DOUBLE + vld VX0, X, 0 + vld VX1, X, 16 + vld VX2, X, 32 + vld VX3, X, 48 + addi.d I, I, -1 + addi.d X, X, 64 + VFMINA VM1, VX0, VX1 + VFMINA VM2, VX2, VX3 + VFMINA VM0, VM0, VM1 + VFMINA VM0, VM0, VM2 +#else + vld VX0, X, 0 + vld VX1, X, 16 + addi.d I, I, -1 + addi.d X, X, 32 + VFMINA VM1, VX0, VX1 + VFMINA VM0, VM0, VM1 +#endif + blt $r0, I, .L10 + +#ifdef DOUBLE + vreplvei.d VX0, VM0, 0 + vreplvei.d VX1, VM0, 1 + VFMINA VM0, VX0, VX1 +#else + vreplvei.w VX0, VM0, 0 + vreplvei.w VX1, VM0, 1 + vreplvei.w VX2, VM0, 2 + vreplvei.w VX3, VM0, 3 + VFMINA VM1, VX0, VX1 + VFMINA VM2, VX2, VX3 + VFMINA VM0, VM1, VM2 +#endif + .align 3 + +.L11: + andi I, N, 7 + bge $r0, I, .L13 + .align 3 + +.L12: + LD $f1, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + FMINA $f0, $f0, $f1 + bnez I, .L12 + .align 3 + +.L13: + FABS $f0, $f0 + jirl $r0, $r1, 0x0 + .align 3 + +.L20: // INCX!=1 + srai.d I, N, 3 + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmina.d VM1, VX0, VX1 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmina.d VM2, VX0, VX1 + vfmina.d VM1, VM1, VM2 + vfmina.d VM0, VM0, VM1 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vfmina.s VM1, VX0, VX1 + vfmina.s VM0, VM0, VM1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + vreplvei.d VX0, VM0, 0 + vreplvei.d VX1, VM0, 1 + 
VFMINA VM0, VX0, VX1 +#else + vreplvei.w VX0, VM0, 0 + vreplvei.w VX1, VM0, 1 + vreplvei.w VX2, VM0, 2 + vreplvei.w VX3, VM0, 3 + VFMINA VM1, VX0, VX1 + VFMINA VM2, VX2, VX3 + VFMINA VM0, VM1, VM2 +#endif + .align 3 + +.L23: //INCX!=1 and N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD $f1, X, 0 + addi.d I, I, -1 + add.d X, X, INCX + FMINA $f0, $f0, $f1 + bnez I, .L24 + .align 3 + +.L999: + FABS $f0, $f0 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/asum_lasx.S b/kernel/loongarch64/asum_lasx.S new file mode 100644 index 000000000..9a2c031f3 --- /dev/null +++ b/kernel/loongarch64/asum_lasx.S @@ -0,0 +1,257 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
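The ASUM kernels accumulate the sum of absolute values. Instead of a vector abs instruction, the code multiplies by a vector of -1 (neg1), compares against zero (xvfcmp.clt), and bit-selects between x and -x (xvbitsel.v) before adding into per-lane partial sums. A scalar sketch of the same idea in C, with the early-exit checks and lane bookkeeping left out; the helper name is ours:

#include "common.h"

/* Hypothetical reference for the branch-free absolute-value-and-accumulate
 * step used by asum_lasx.S and asum_lsx.S. */
static FLOAT asum_ref(BLASLONG n, const FLOAT *x, BLASLONG inc_x)
{
    FLOAT sum = 0.0;
    for (BLASLONG i = 0; i < n; i++) {
        FLOAT v   = x[i * inc_x];
        FLOAT neg = -1.0 * v;            /* xvfmul with the neg1 vector */
        sum += (v < 0.0) ? neg : v;      /* xvfcmp.clt.* + xvbitsel.v   */
    }
    return sum;
}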
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 +#define VT0 $xr23 +#define VT1 $xr22 +#define res1 $xr16 +#define res2 $xr17 +#define res0 $xr18 +#define neg1 $xr19 + + PROLOGUE + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + xvxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 +#ifdef DOUBLE + li.d t1, -1 + xvreplgr2vr.d neg1, t1 + xvffint.d.l neg1, neg1 +#else + li.w t1, -1 + xvreplgr2vr.w neg1, t1 + xvffint.s.w neg1, neg1 +#endif + li.d TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvfmul.d VX2, neg1, VX0 + xvfmul.d VX3, neg1, VX1 + xvfcmp.clt.d VT0, VX0, res0 + xvfcmp.clt.d VT1, VX1, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvbitsel.v VX1, VX1, VX3, VT1 + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 +#else + xvld VX0, X, 0 * SIZE + xvfmul.s VX2, neg1, VX0 + xvfcmp.clt.s VT0, VX0, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvfadd.s res1, VX0, res1 +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L11 + .align 3 + +.L12: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 +#else + xvfadd.s res2, res1, res2 + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res2, 4 + xvpickve.w VX1, res2, 5 + xvpickve.w VX2, res2, 6 + xvpickve.w VX3, res2, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX2, res1 +#endif + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + LD $f12, X, 0 * SIZE + FABS $f12, $f12 + ADD $f16, $f12, $f16 + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfmul.d VX2, neg1, VX0 + xvfmul.d VX3, neg1, VX1 + xvfcmp.clt.d VT0, VX0, res0 + xvfcmp.clt.d VT1, VX1, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvbitsel.v VX1, VX1, VX3, VT1 + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w 
VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvfmul.s VX2, neg1, VX0 + xvfcmp.clt.s VT0, VX0, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvfadd.s res1, VX0, res1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 +#else + xvfadd.s res2, res1, res2 + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res2, 4 + xvpickve.w VX1, res2, 5 + xvpickve.w VX2, res2, 6 + xvpickve.w VX3, res2, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX2, res1 +#endif + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD $f12, X, 0 * SIZE + FABS $f12, $f12 + ADD $f16, $f12, $f16 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + MOV $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/asum_lsx.S b/kernel/loongarch64/asum_lsx.S new file mode 100644 index 000000000..512b01404 --- /dev/null +++ b/kernel/loongarch64/asum_lsx.S @@ -0,0 +1,258 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
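As in the LASX version above, the LSX vector loop keeps one partial sum per lane; the .L12/.L22 blocks then fold the lanes together with vreplvei (xvpickve in the LASX file) plus repeated vector adds. That fold amounts to nothing more than the loop below, shown for the four single-precision LSX lanes (two for double, eight for LASX single); the helper is illustrative only:

#include "common.h"

/* Fold the per-lane partial sums held in the accumulator register (res1)
 * into a single scalar result. */
static FLOAT fold_lanes(const FLOAT partial[4])
{
    FLOAT total = partial[0];
    for (int lane = 1; lane < 4; lane++)
        total += partial[lane];
    return total;
}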
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 +#define VT0 $vr23 +#define VT1 $vr22 +#define res1 $vr16 +#define res2 $vr17 +#define res0 $vr18 +#define neg1 $vr19 + + PROLOGUE + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + vxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 +#ifdef DOUBLE + li.d t1, -1 + vreplgr2vr.d neg1, t1 + vffint.d.l neg1, neg1 +#else + li.w t1, -1 + vreplgr2vr.w neg1, t1 + vffint.s.w neg1, neg1 +#endif + li.d TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 +#else + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + vfmul.s VX2, neg1, VX0 + vfmul.s VX3, neg1, VX1 + vfcmp.clt.s VT0, VX0, res0 + vfcmp.clt.s VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.s res2, VX0, VX1 + vfadd.s res1, res1, res2 +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L11 + .align 3 + +.L12: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 +#endif + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + LD $f12, X, 0 * SIZE + FABS $f12, $f12 + ADD $f16, $f12, $f16 + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + 
vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vfmul.s VX2, neg1, VX0 + vfmul.s VX3, neg1, VX1 + vfcmp.clt.s VT0, VX0, res0 + vfcmp.clt.s VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.s res2, VX0, VX1 + vfadd.s res1, res1, res2 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 +#endif + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD $f12, X, 0 * SIZE + FABS $f12, $f12 + ADD $f16, $f12, $f16 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + MOV $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/daxpby_lasx.S b/kernel/loongarch64/axpby_lasx.S similarity index 54% rename from kernel/loongarch64/daxpby_lasx.S rename to kernel/loongarch64/axpby_lasx.S index 4b19703e7..7a246ca5c 100644 --- a/kernel/loongarch64/daxpby_lasx.S +++ b/kernel/loongarch64/axpby_lasx.S @@ -1,6 +1,33 @@ -#define ASSEMBLER +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ +#define ASSEMBLER #include "common.h" + #define N $r4 #define ALPHA $f0 #define X $r5 @@ -30,18 +57,29 @@ PROLOGUE bge $r0, N, .L999 - li.d TEMP, 1 movgr2fr.d a1, $r0 - ffint.d.l a1, a1 - slli.d TEMP, TEMP, BASE_SHIFT + ffint.s.l a1, a1 slli.d INCX, INCX, BASE_SHIFT slli.d INCY, INCY, BASE_SHIFT - movfr2gr.d t1, ALPHA + MTG t1, ALPHA + MTG t2, BETA + MTG t3, a1 +#ifdef DOUBLE xvreplgr2vr.d VXA, t1 - movfr2gr.d t2, BETA xvreplgr2vr.d VXB, t2 - movfr2gr.d t3, a1 xvreplgr2vr.d VXZ, t3 +#else + xvreplgr2vr.w VXA, t1 + xvreplgr2vr.w VXB, t2 + xvreplgr2vr.w VXZ, t3 +#endif + // If incx == 0 || incy == 0, do one by one + and TEMP, INCX, INCY + or I, N, N + beqz TEMP, .L998 + + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT srai.d I, N, 3 bne INCX, TEMP, .L20 bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 @@ -52,21 +90,22 @@ .L11: bge $r0, I, .L997 - fcmp.ceq.d $fcc0, ALPHA, a1 + CMPEQ $fcc0, ALPHA, a1 bcnez $fcc0, .L110 - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L112 // ALPHA!=0 BETA==0 b .L111 // ALPHA!=0 BETA!=0 .align 3 .L110: - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L114 // ALPHA==0 BETA==0 b .L113 // ALPHA==0 BETA!=0 .align 3 .L111: // ALPHA!=0 BETA!=0 xvld VX0, X, 0 * SIZE +#ifdef DOUBLE xvld VX2, Y, 0 * SIZE xvld VX1, X, 4 * SIZE xvld VX3, Y, 4 * SIZE @@ -77,6 +116,13 @@ addi.d I, I, -1 xvst VX2, Y, 0 * SIZE xvst VX3, Y, 4 * SIZE +#else + xvld VX2, Y, 0 * SIZE + xvfmul.s VX0, VX0, VXA + addi.d I, I, -1 + xvfmadd.s VX2, VX2, VXB, VX0 + xvst VX2, Y, 0 * SIZE +#endif addi.d X, X, 8 * SIZE addi.d Y, Y, 8 * SIZE blt $r0, I, .L111 @@ -85,34 +131,46 @@ .L112: // ALPHA!=0 BETA==0 xvld VX0, X, 0 * SIZE +#ifdef DOUBLE xvld VX1, X, 4 * SIZE xvfmul.d VX0, VX0, VXA xvfmul.d VX1, VX1, VXA xvst VX0, Y, 0 * SIZE xvst VX1, Y, 4 * SIZE +#else + xvfmul.s VX0, VX0, VXA + addi.d I, I, -1 + xvst VX0, Y, 0 * SIZE +#endif addi.d X, X, 8 * SIZE addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 blt $r0, I, .L112 b .L997 .align 3 .L113: // ALPHA==0 BETA!=0 xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE xvld VX3, Y, 4 * SIZE xvfmul.d VX2, VX2, VXB xvfmul.d VX3, VX3, VXB xvst VX2, Y, 0 * SIZE xvst VX3, Y, 4 * SIZE - addi.d Y, Y, 8 * SIZE +#else + xvfmul.s VX2, VX2, VXB + xvst VX2, Y, 0 * SIZE +#endif addi.d I, I, -1 + addi.d Y, Y, 8 * SIZE blt $r0, I, .L113 b .L997 .align 3 .L114: // ALPHA==0 BETA==0 xvst VXZ, Y, 0 * SIZE +#ifdef DOUBLE xvst VXZ, Y, 4 * SIZE +#endif addi.d Y, Y, 8 * SIZE addi.d I, I, -1 blt $r0, I, .L114 @@ -122,21 +180,22 @@ .L12: // INCX==1 and INCY!=1 bge $r0, I, .L997 move YY, Y - fcmp.ceq.d $fcc0, ALPHA, a1 + CMPEQ $fcc0, ALPHA, a1 bcnez $fcc0, .L120 - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L122 // ALPHA!=0 BETA==0 b .L121 // ALPHA!=0 BETA!=0 .align 3 .L120: - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L124 // ALPHA==0 BETA==0 b .L123 // ALPHA==0 BETA!=0 .align 3 .L121: // ALPHA!=0 BETA!=0 xvld VX0, X, 0 * SIZE +#ifdef DOUBLE ld.d t1, Y, 0 * SIZE add.d Y, Y, INCY ld.d t2, Y, 0 * SIZE @@ -182,14 +241,59 @@ xvstelm.d VX3, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VX3, YY, 0, 3 +#else + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * 
SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VX0, VX0, VXA + xvfmadd.s VX2, VX2, VXB, VX0 + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 +#endif add.d YY, YY, INCY addi.d X, X, 8 * SIZE + addi.d I, I, -1 blt $r0, I, .L121 b .L997 .align 3 .L122: // ALPHA!=0 BETA==0 xvld VX0, X, 0 * SIZE +#ifdef DOUBLE xvld VX1, X, 4 * SIZE xvfmul.d VX0, VX0, VXA xvfmul.d VX1, VX1, VXA @@ -208,14 +312,33 @@ xvstelm.d VX1, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VX1, YY, 0, 3 +#else + xvfmul.s VX0, VX0, VXA + addi.d I, I, -1 + xvstelm.w VX0, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 7 +#endif add.d YY, YY, INCY addi.d X, X, 8 * SIZE - addi.d I, I, -1 blt $r0, I, .L122 b .L997 .align 3 .L123: // ALPHA==0 BETA!=0 +#ifdef DOUBLE ld.d t1, Y, 0 * SIZE add.d Y, Y, INCY ld.d t2, Y, 0 * SIZE @@ -250,7 +373,6 @@ xvstelm.d VX2, YY, 0, 3 add.d YY, YY, INCY xvfmul.d VX3, VX3, VXB - addi.d I, I, -1 xvstelm.d VX3, YY, 0, 0 add.d YY, YY, INCY xvstelm.d VX3, YY, 0, 1 @@ -258,12 +380,56 @@ xvstelm.d VX3, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VX3, YY, 0, 3 +#else + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VX2, VX2, VXB + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 blt $r0, I, .L123 b .L997 .align 3 .L124: // ALPHA==0 BETA==0 +#ifdef DOUBLE xvstelm.d VXZ, YY, 0, 0 add.d YY, YY, INCY xvstelm.d VXZ, YY, 0, 1 @@ -279,6 +445,23 @@ xvstelm.d VXZ, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VXZ, YY, 0, 3 +#else + xvstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 7 +#endif add.d YY, YY, INCY addi.d I, I, -1 blt $r0, I, .L124 @@ -287,21 +470,22 @@ .L21:// INCX!=1 and INCY==1 bge $r0, I, .L997 - fcmp.ceq.d $fcc0, ALPHA, a1 + CMPEQ $fcc0, ALPHA, a1 bcnez $fcc0, .L210 - 
fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L212 // ALPHA!=0 BETA==0 b .L211 // ALPHA!=0 BETA!=0 .align 3 .L210: - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L214 // ALPHA==0 BETA==0 b .L213 // ALPHA==0 BETA!=0 .align 3 .L211: // ALPHA!=0 BETA!=0 xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE ld.d t1, X, 0 * SIZE add.d X, X, INCX ld.d t2, X, 0 * SIZE @@ -334,12 +518,43 @@ xvfmadd.d VX3, VX3, VXB, VX1 addi.d I, I, -1 xvst VX3, Y, 4 * SIZE +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfmul.s VX0, VXA, VX0 + xvfmadd.s VX2, VX2, VXB, VX0 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE +#endif addi.d Y, Y, 8 * SIZE blt $r0, I, .L211 b .L997 .align 3 .L212: // ALPHA!=0 BETA==0 +#ifdef DOUBLE ld.d t1, X, 0 * SIZE add.d X, X, INCX ld.d t2, X, 0 * SIZE @@ -369,6 +584,35 @@ xvfmul.d VX1, VX1, VXA addi.d I, I, -1 xvst VX1, Y, 4 * SIZE +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfmul.s VX0, VXA, VX0 + addi.d I, I, -1 + xvst VX0, Y, 0 * SIZE +#endif addi.d Y, Y, 8 * SIZE blt $r0, I, .L212 b .L997 @@ -376,20 +620,27 @@ .L213: // ALPHA==0 BETA!=0 xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE xvld VX3, Y, 4 * SIZE xvfmul.d VX2, VX2, VXB xvfmul.d VX3, VX3, VXB - addi.d I, I, -1 xvst VX2, Y, 0 * SIZE xvst VX3, Y, 4 * SIZE +#else + xvfmul.s VX2, VX2, VXB + xvst VX2, Y, 0 * SIZE +#endif addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 blt $r0, I, .L213 b .L997 .align 3 .L214: // ALPHA==0 BETA==0 xvst VXZ, Y, 0 * SIZE +#ifdef DOUBLE xvst VXZ, Y, 4 * SIZE +#endif addi.d Y, Y, 8 * SIZE addi.d I, I, -1 blt $r0, I, .L214 @@ -399,20 +650,21 @@ .L22: bge $r0, I, .L997 move YY, Y - fcmp.ceq.d $fcc0, ALPHA, a1 + CMPEQ $fcc0, ALPHA, a1 bcnez $fcc0, .L220 - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L222 // ALPHA!=0 BETA==0 b .L221 // ALPHA!=0 BETA!=0 .align 3 .L220: - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L224 // ALPHA==0 BETA==0 b .L223 // ALPHA==0 BETA!=0 .align 3 .L221: // ALPHA!=0 BETA!=0 +#ifdef DOUBLE ld.d t1, X, 0 * SIZE add.d X, X, INCX ld.d t2, X, 0 * SIZE @@ -481,12 +733,81 @@ xvstelm.d VX3, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VX3, YY, 0, 3 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, 
X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VX0, VX0, VXA + xvfmadd.s VX2, VX2, VXB, VX0 + addi.d I, I, -1 + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 +#endif add.d YY, YY, INCY blt $r0, I, .L221 b .L997 .align 3 .L222: // ALPHA!=0 BETA==0 +#ifdef DOUBLE ld.d t1, X, 0 * SIZE add.d X, X, INCX ld.d t2, X, 0 * SIZE @@ -529,12 +850,56 @@ xvstelm.d VX1, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VX1, YY, 0, 3 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfmul.s VX0, VX0, VXA + addi.d I, I, -1 + xvstelm.w VX0, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 7 +#endif add.d YY, YY, INCY blt $r0, I, .L222 b .L997 .align 3 .L223: // ALPHA==0 BETA!=0 +#ifdef DOUBLE ld.d t1, Y, 0 * SIZE add.d Y, Y, INCY ld.d t2, Y, 0 * SIZE @@ -577,12 +942,56 @@ xvstelm.d VX3, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VX3, YY, 0, 3 +#else + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VX2, VX2, VXB + addi.d I, I, -1 + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 +#endif add.d YY, YY, INCY blt $r0, I, .L223 
b .L997 .align 3 .L224: // ALPHA==0 BETA==0 +#ifdef DOUBLE xvstelm.d VXZ, YY, 0, 0 add.d YY, YY, INCY xvstelm.d VXZ, YY, 0, 1 @@ -598,6 +1007,23 @@ xvstelm.d VXZ, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VXZ, YY, 0, 3 +#else + xvstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 7 +#endif add.d YY, YY, INCY addi.d I, I, -1 blt $r0, I, .L224 @@ -610,12 +1036,12 @@ .align 3 .L998: - fld.d $f12, X, 0 * SIZE - fld.d $f13, Y, 0 * SIZE + LD $f12, X, 0 * SIZE + LD $f13, Y, 0 * SIZE addi.d I, I, -1 - fmul.d $f12, $f12, ALPHA - fmadd.d $f13, $f13, BETA, $f12 - fst.d $f13, Y, 0 * SIZE + MUL $f12, $f12, ALPHA + MADD $f13, $f13, BETA, $f12 + ST $f13, Y, 0 * SIZE add.d X, X, INCX add.d Y, Y, INCY blt $r0, I, .L998 diff --git a/kernel/loongarch64/daxpby_lsx.S b/kernel/loongarch64/axpby_lsx.S similarity index 55% rename from kernel/loongarch64/daxpby_lsx.S rename to kernel/loongarch64/axpby_lsx.S index 9aafbaf2a..e50d4cdcc 100644 --- a/kernel/loongarch64/daxpby_lsx.S +++ b/kernel/loongarch64/axpby_lsx.S @@ -1,6 +1,33 @@ -#define ASSEMBLER +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ +#define ASSEMBLER #include "common.h" + #define N $r4 #define ALPHA $f0 #define X $r5 @@ -30,18 +57,29 @@ PROLOGUE bge $r0, N, .L999 - li.d TEMP, 1 movgr2fr.d a1, $r0 - ffint.d.l a1, a1 - slli.d TEMP, TEMP, BASE_SHIFT + ffint.s.l a1, a1 slli.d INCX, INCX, BASE_SHIFT slli.d INCY, INCY, BASE_SHIFT - movfr2gr.d t1, ALPHA + MTG t1, ALPHA + MTG t2, BETA + MTG t3, a1 +#ifdef DOUBLE vreplgr2vr.d VXA, t1 - movfr2gr.d t2, BETA vreplgr2vr.d VXB, t2 - movfr2gr.d t3, a1 vreplgr2vr.d VXZ, t3 +#else + vreplgr2vr.w VXA, t1 + vreplgr2vr.w VXB, t2 + vreplgr2vr.w VXZ, t3 +#endif + // If incx == 0 || incy == 0, do one by one + and TEMP, INCX, INCY + or I, N, N + beqz TEMP, .L998 + + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT srai.d I, N, 3 bne INCX, TEMP, .L20 bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 @@ -52,15 +90,15 @@ .L11: bge $r0, I, .L997 - fcmp.ceq.d $fcc0, ALPHA, a1 + CMPEQ $fcc0, ALPHA, a1 bcnez $fcc0, .L110 - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L112 // ALPHA!=0 BETA==0 b .L111 // ALPHA!=0 BETA!=0 .align 3 .L110: - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L114 // ALPHA==0 BETA==0 b .L113 // ALPHA==0 BETA!=0 .align 3 @@ -68,6 +106,7 @@ .L111: // ALPHA!=0 BETA!=0 vld VX0, X, 0 * SIZE vld VX2, Y, 0 * SIZE +#ifdef DOUBLE vld VX1, X, 2 * SIZE vld VX3, Y, 2 * SIZE vfmul.d VX0, VX0, VXA @@ -86,6 +125,16 @@ vfmadd.d VX3, VX3, VXB, VX1 vst VX2, Y, 4 * SIZE vst VX3, Y, 6 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vfmul.s VX0, VX0, VXA + vfmul.s VX1, VX1, VXA + vfmadd.s VX2, VX2, VXB, VX0 + vfmadd.s VX3, VX3, VXB, VX1 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE +#endif addi.d X, X, 8 * SIZE addi.d Y, Y, 8 * SIZE addi.d I, I, -1 @@ -95,6 +144,7 @@ .L112: // ALPHA!=0 BETA==0 vld VX0, X, 0 * SIZE +#ifdef DOUBLE vld VX1, X, 2 * SIZE vfmul.d VX0, VX0, VXA vfmul.d VX1, VX1, VXA @@ -106,6 +156,13 @@ vfmul.d VX3, VX3, VXA vst VX2, Y, 4 * SIZE vst VX3, Y, 6 * SIZE +#else + vld VX1, X, 4 * SIZE + vfmul.s VX0, VX0, VXA + vfmul.s VX1, VX1, VXA + vst VX0, Y, 0 * SIZE + vst VX1, Y, 4 * SIZE +#endif addi.d X, X, 8 * SIZE addi.d Y, Y, 8 * SIZE addi.d I, I, -1 @@ -113,7 +170,8 @@ b .L997 .align 3 -.L113: // ALPHA==0 BETA!=0\ +.L113: // ALPHA==0 BETA!=0 +#ifdef DOUBLE vld VX0, Y, 0 * SIZE vld VX1, Y, 2 * SIZE vfmul.d VX0, VX0, VXB @@ -126,6 +184,14 @@ vfmul.d VX3, VX3, VXB vst VX2, Y, 4 * SIZE vst VX3, Y, 6 * SIZE +#else + vld VX2, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + vfmul.s VX2, VX2, VXB + vfmul.s VX3, VX3, VXB + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE +#endif addi.d Y, Y, 8 * SIZE addi.d I, I, -1 blt $r0, I, .L113 @@ -134,9 +200,13 @@ .L114: // ALPHA==0 BETA==0 vst VXZ, Y, 0 * SIZE +#ifdef DOUBLE vst VXZ, Y, 2 * SIZE vst VXZ, Y, 4 * SIZE vst VXZ, Y, 6 * SIZE +#else + vst VXZ, Y, 4 * SIZE +#endif addi.d Y, Y, 8 * SIZE addi.d I, I, -1 blt $r0, I, .L114 @@ -146,21 +216,22 @@ .L12: // INCX==1 and INCY!=1 bge $r0, I, .L997 move YY, Y - fcmp.ceq.d $fcc0, ALPHA, a1 + CMPEQ $fcc0, ALPHA, a1 bcnez $fcc0, .L120 - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L122 // ALPHA!=0 BETA==0 b .L121 // ALPHA!=0 BETA!=0 .align 3 .L120: - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L124 // ALPHA==0 BETA==0 b .L123 // ALPHA==0 BETA!=0 .align 3 .L121: // ALPHA!=0 BETA!=0 vld VX0, X, 0 * SIZE +#ifdef DOUBLE ld.d t1, Y, 0 * SIZE add.d Y, Y, INCY ld.d t2, Y, 0 * SIZE @@ -212,6 +283,53 @@ vstelm.d VX3, YY, 0, 0 add.d YY, YY, INCY vstelm.d VX3, YY, 0, 1 
+#else + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmul.s VX0, VX0, VXA + vld VX1, X, 4 * SIZE + vfmadd.s VX2, VX2, VXB, VX0 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + vfmul.s VX1, VX1, VXA + vfmadd.s VX3, VX3, VXB, VX1 + addi.d I, I, -1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 +#endif add.d YY, YY, INCY addi.d X, X, 8 * SIZE blt $r0, I, .L121 @@ -220,6 +338,7 @@ .L122: // ALPHA!=0 BETA==0 vld VX0, X, 0 * SIZE +#ifdef DOUBLE vld VX1, X, 2 * SIZE vfmul.d VX0, VX0, VXA vfmul.d VX1, VX1, VXA @@ -242,6 +361,26 @@ vstelm.d VX1, YY, 0, 0 add.d YY, YY, INCY vstelm.d VX1, YY, 0, 1 +#else + vld VX1, X, 4 * SIZE + vfmul.s VX0, VX0, VXA + vfmul.s VX1, VX1, VXA + vstelm.w VX0, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 3 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 3 +#endif add.d YY, YY, INCY addi.d X, X, 8 * SIZE addi.d I, I, -1 @@ -250,6 +389,7 @@ .align 3 .L123: // ALPHA==0 BETA!=0 +#ifdef DOUBLE ld.d t1, Y, 0 * SIZE add.d Y, Y, INCY ld.d t2, Y, 0 * SIZE @@ -294,12 +434,57 @@ vstelm.d VX3, YY, 0, 0 add.d YY, YY, INCY vstelm.d VX3, YY, 0, 1 +#else + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmul.s VX2, VX2, VXB + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + vfmul.s VX3, VX3, VXB + addi.d I, I, -1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 +#endif add.d YY, YY, INCY blt $r0, I, .L123 b .L997 .align 3 .L124: // ALPHA==0 BETA==0 +#ifdef DOUBLE vstelm.d VXZ, YY, 0, 0 add.d YY, YY, INCY vstelm.d VXZ, YY, 0, 1 @@ -315,6 +500,23 @@ vstelm.d VXZ, YY, 0, 0 add.d YY, YY, INCY vstelm.d VXZ, YY, 0, 1 +#else + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w 
VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 +#endif add.d YY, YY, INCY addi.d I, I, -1 blt $r0, I, .L124 @@ -323,21 +525,22 @@ .L21:// INCX!=1 and INCY==1 bge $r0, I, .L997 - fcmp.ceq.d $fcc0, ALPHA, a1 + CMPEQ $fcc0, ALPHA, a1 bcnez $fcc0, .L210 - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L212 // ALPHA!=0 BETA==0 b .L211 // ALPHA!=0 BETA!=0 .align 3 .L210: - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L214 // ALPHA==0 BETA==0 b .L213 // ALPHA==0 BETA!=0 .align 3 .L211: // ALPHA!=0 BETA!=0 vld VX2, Y, 0 * SIZE +#ifdef DOUBLE ld.d t1, X, 0 * SIZE add.d X, X, INCX ld.d t2, X, 0 * SIZE @@ -378,12 +581,47 @@ vfmadd.d VX3, VX3, VXB, VX1 addi.d I, I, -1 vst VX3, Y, 6 * SIZE +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfmul.s VX0, VXA, VX0 + vld VX3, Y, 4 * SIZE + vfmadd.s VX2, VX2, VXB, VX0 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vst VX2, Y, 0 * SIZE + vfmul.s VX1, VX1, VXA + vfmadd.s VX3, VX3, VXB, VX1 + addi.d I, I, -1 + vst VX3, Y, 4 * SIZE +#endif addi.d Y, Y, 8 * SIZE blt $r0, I, .L211 b .L997 .align 3 .L212: // ALPHA!=0 BETA==0 +#ifdef DOUBLE ld.d t1, X, 0 * SIZE add.d X, X, INCX ld.d t2, X, 0 * SIZE @@ -417,6 +655,37 @@ vfmul.d VX1, VX1, VXA addi.d I, I, -1 vst VX1, Y, 6 * SIZE +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfmul.s VX0, VXA, VX0 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vst VX0, Y, 0 * SIZE + vfmul.s VX1, VX1, VXA + addi.d I, I, -1 + vst VX1, Y, 4 * SIZE +#endif addi.d Y, Y, 8 * SIZE blt $r0, I, .L212 b .L997 @@ -424,6 +693,7 @@ .L213: // ALPHA==0 BETA!=0 vld VX2, Y, 0 * SIZE +#ifdef DOUBLE vld VX3, Y, 2 * SIZE vfmul.d VX2, VX2, VXB vfmul.d VX3, VX3, VXB @@ -433,19 +703,30 @@ vld VX3, Y, 6 * SIZE vfmul.d VX2, VX2, VXB vfmul.d VX3, VX3, VXB - addi.d I, I, -1 vst VX2, Y, 4 * SIZE vst VX3, Y, 6 * SIZE +#else + vld VX3, Y, 4 * SIZE + vfmul.s VX2, VX2, VXB + vfmul.s VX3, VX3, VXB + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE +#endif addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 blt $r0, I, .L213 b .L997 .align 3 .L214: // ALPHA==0 BETA==0 vst VXZ, Y, 0 * SIZE +#ifdef DOUBLE vst VXZ, Y, 2 * SIZE vst VXZ, Y, 4 * SIZE vst VXZ, Y, 6 * SIZE +#else + vst VXZ, Y, 4 * SIZE +#endif addi.d Y, Y, 8 * SIZE addi.d I, I, -1 blt $r0, I, .L214 @@ -455,20 +736,21 @@ .L22: bge $r0, I, .L997 move YY, Y - fcmp.ceq.d $fcc0, ALPHA, a1 + CMPEQ $fcc0, ALPHA, a1 bcnez $fcc0, .L220 - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L222 // ALPHA!=0 BETA==0 b .L221 // ALPHA!=0 BETA!=0 .align 3 .L220: - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L224 // ALPHA==0 BETA==0 b .L223 // ALPHA==0 BETA!=0 .align 3 
.L221: // ALPHA!=0 BETA!=0 +#ifdef DOUBLE ld.d t1, X, 0 * SIZE add.d X, X, INCX ld.d t2, X, 0 * SIZE @@ -541,12 +823,83 @@ vstelm.d VX3, YY, 0, 0 add.d YY, YY, INCY vstelm.d VX3, YY, 0, 1 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmul.s VX0, VX0, VXA + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vfmadd.s VX2, VX2, VXB, VX0 + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + vfmul.s VX1, VX1, VXA + addi.d I, I, -1 + vfmadd.s VX3, VX3, VXB, VX1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 +#endif add.d YY, YY, INCY blt $r0, I, .L221 b .L997 .align 3 .L222: // ALPHA!=0 BETA==0 +#ifdef DOUBLE ld.d t1, X, 0 * SIZE add.d X, X, INCX ld.d t2, X, 0 * SIZE @@ -591,12 +944,57 @@ vstelm.d VX1, YY, 0, 0 add.d YY, YY, INCY vstelm.d VX1, YY, 0, 1 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfmul.s VX0, VX0, VXA + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vstelm.w VX0, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 3 + add.d YY, YY, INCY + vfmul.s VX1, VX1, VXA + addi.d I, I, -1 + vstelm.w VX1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 3 +#endif add.d YY, YY, INCY blt $r0, I, .L222 b .L997 .align 3 .L223: // ALPHA==0 BETA!=0 +#ifdef DOUBLE ld.d t1, Y, 0 * SIZE add.d Y, Y, INCY ld.d t2, Y, 0 * SIZE @@ -641,12 +1039,57 @@ vstelm.d VX3, YY, 0, 0 add.d YY, YY, INCY vstelm.d VX3, YY, 0, 1 +#else + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmul.s VX2, VX2, VXB + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 
* SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + vfmul.s VX3, VX3, VXB + addi.d I, I, -1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 +#endif add.d YY, YY, INCY blt $r0, I, .L223 b .L997 .align 3 .L224: // ALPHA==0 BETA==0 +#ifdef DOUBLE vstelm.d VXZ, YY, 0, 0 add.d YY, YY, INCY vstelm.d VXZ, YY, 0, 1 @@ -662,6 +1105,23 @@ vstelm.d VXZ, YY, 0, 0 add.d YY, YY, INCY vstelm.d VXZ, YY, 0, 1 +#else + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 +#endif add.d YY, YY, INCY addi.d I, I, -1 blt $r0, I, .L224 @@ -674,12 +1134,12 @@ .align 3 .L998: - fld.d $f12, X, 0 * SIZE - fld.d $f13, Y, 0 * SIZE + LD $f12, X, 0 * SIZE + LD $f13, Y, 0 * SIZE addi.d I, I, -1 - fmul.d $f12, $f12, ALPHA - fmadd.d $f13, $f13, BETA, $f12 - fst.d $f13, Y, 0 * SIZE + MUL $f12, $f12, ALPHA + MADD $f13, $f13, BETA, $f12 + ST $f13, Y, 0 * SIZE add.d X, X, INCX add.d Y, Y, INCY blt $r0, I, .L998 diff --git a/kernel/loongarch64/daxpy_lasx.S b/kernel/loongarch64/axpy_lasx.S similarity index 52% rename from kernel/loongarch64/daxpy_lasx.S rename to kernel/loongarch64/axpy_lasx.S index bafd871ab..707fd09b5 100644 --- a/kernel/loongarch64/daxpy_lasx.S +++ b/kernel/loongarch64/axpy_lasx.S @@ -1,6 +1,33 @@ -#define ASSEMBLER +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ +#define ASSEMBLER #include "common.h" + #define N $r4 #define XX $r5 #define YY $r6 @@ -35,16 +62,20 @@ bge $r0, N, .L999 li.d TEMP, 1 movgr2fr.d a1, $r0 - ffint.d.l a1, a1 + FFINT a1, a1 movgr2fr.d a2, TEMP - ffint.d.l a2, a2 - fcmp.ceq.d $fcc0, ALPHA, a1 + FFINT a2, a2 + CMPEQ $fcc0, ALPHA, a1 bcnez $fcc0, .L999 slli.d TEMP, TEMP, BASE_SHIFT slli.d INCX, INCX, BASE_SHIFT slli.d INCY, INCY, BASE_SHIFT - movfr2gr.d t1, ALPHA + MTG t1, ALPHA +#ifdef DOUBLE xvreplgr2vr.d VXA, t1 +#else + xvreplgr2vr.w VXA, t1 +#endif srai.d I, N, 3 bne INCX, TEMP, .L20 @@ -56,11 +87,12 @@ .L11: bge $r0, I, .L113 - fcmp.ceq.d $fcc0, ALPHA, a2 + CMPEQ $fcc0, ALPHA, a2 bceqz $fcc0, .L112 .align 3 .L111: +#ifdef DOUBLE xvld VX0, X, 0 * SIZE xvld VX2, Y, 0 * SIZE xvld VX1, X, 4 * SIZE @@ -70,6 +102,13 @@ addi.d I, I, -1 xvst VX2, Y, 0 * SIZE xvst VX3, Y, 4 * SIZE +#else + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + addi.d I, I, -1 + xvfadd.s VX2, VX0, VX2 + xvst VX2, Y, 0 * SIZE +#endif addi.d X, X, 8 * SIZE addi.d Y, Y, 8 * SIZE blt $r0, I, .L111 @@ -77,6 +116,7 @@ .align 3 .L112: +#ifdef DOUBLE xvld VX0, X, 0 * SIZE xvld VX2, Y, 0 * SIZE xvld VX1, X, 4 * SIZE @@ -86,6 +126,13 @@ addi.d I, I, -1 xvst VX2, Y, 0 * SIZE xvst VX3, Y, 4 * SIZE +#else + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + addi.d I, I, -1 + xvfmadd.s VX2, VX0, VXA, VX2 + xvst VX2, Y, 0 * SIZE +#endif addi.d X, X, 8 * SIZE addi.d Y, Y, 8 * SIZE blt $r0, I, .L112 @@ -97,11 +144,11 @@ .align 3 .L114: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE addi.d I, I, -1 - fmadd.d $f14, $f12, $f0, $f14 - fst.d $f14, Y, 0 * SIZE + MADD $f14, $f12, $f0, $f14 + ST $f14, Y, 0 * SIZE addi.d X, X, SIZE addi.d Y, Y, SIZE blt $r0, I, .L114 @@ -114,6 +161,7 @@ .align 3 .L121: +#ifdef DOUBLE xvld VX0, X, 0 * SIZE ld.d t1, Y, 0 * SIZE add.d Y, Y, INCY @@ -158,6 +206,50 @@ xvstelm.d VX3, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VX3, YY, 0, 3 +#else + xvld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmadd.s VX2, VX0, VXA, VX2 + addi.d I, I, -1 + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 +#endif add.d YY, YY, INCY addi.d X, X, 8 * SIZE blt $r0, I, .L121 @@ -169,11 +261,11 @@ .align 3 .L123: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE addi.d I, I, -1 - fmadd.d $f14, $f12, $f0, $f14 - fst.d $f14, Y, 0 * SIZE + MADD $f14, $f12, $f0, $f14 + ST $f14, Y, 0 * SIZE addi.d X, X, SIZE add.d Y, Y, INCY blt $r0, I, .L123 @@ -185,6 +277,7 @@ .align 3 .L211: +#ifdef DOUBLE xvld VX2, Y, 0 * SIZE ld.d t1, X, 0 * SIZE add.d X, X, INCX @@ -217,6 +310,37 @@ addi.d I, I, -1 xvst VX3, Y, 
4 * SIZE addi.d Y, Y, 8 * SIZE +#else + xvld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvfmadd.s VX2, VX0, VXA, VX2 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE + addi.d Y, Y, 8 * SIZE +#endif blt $r0, I, .L211 .align 3 @@ -226,11 +350,11 @@ .align 3 .L213: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE addi.d I, I, -1 - fmadd.d $f14, $f12, $f0, $f14 - fst.d $f14, Y, 0 * SIZE + MADD $f14, $f12, $f0, $f14 + ST $f14, Y, 0 * SIZE add.d X, X, INCX addi.d Y, Y, SIZE blt $r0, I, .L213 @@ -243,6 +367,7 @@ .align 3 .L222: +#ifdef DOUBLE ld.d t1, X, 0 * SIZE add.d X, X, INCX ld.d t2, X, 0 * SIZE @@ -309,6 +434,73 @@ xvstelm.d VX3, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VX3, YY, 0, 3 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmadd.s VX2, VX0, VXA, VX2 + addi.d I, I, -1 + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 +#endif add.d YY, YY, INCY blt $r0, I, .L222 .align 3 @@ -319,15 +511,14 @@ .align 3 .L224: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE addi.d I, I, -1 - fmadd.d $f14, $f12, $f0, $f14 - fst.d $f14, Y, 0 * SIZE + MADD $f14, $f12, $f0, $f14 + ST $f14, Y, 0 * SIZE add.d X, X, INCX add.d Y, Y, INCY blt $r0, I, .L224 - b .L999 .align 3 .L999: diff --git a/kernel/loongarch64/daxpy_lsx.S b/kernel/loongarch64/axpy_lsx.S similarity index 53% rename from kernel/loongarch64/daxpy_lsx.S rename to kernel/loongarch64/axpy_lsx.S index fc88f0bb9..0d74e2bce 100644 --- a/kernel/loongarch64/daxpy_lsx.S +++ b/kernel/loongarch64/axpy_lsx.S @@ -1,6 +1,33 @@ -#define ASSEMBLER +/*************************************************************************** 
+Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#define ASSEMBLER #include "common.h" + #define N $r4 #define XX $r5 #define YY $r6 @@ -35,16 +62,20 @@ bge $r0, N, .L999 li.d TEMP, 1 movgr2fr.d a1, $r0 - ffint.d.l a1, a1 + FFINT a1, a1 movgr2fr.d a2, TEMP - ffint.d.l a2, a2 - fcmp.ceq.d $fcc0, ALPHA, a1 + FFINT a2, a2 + CMPEQ $fcc0, ALPHA, a1 bcnez $fcc0, .L999 slli.d TEMP, TEMP, BASE_SHIFT slli.d INCX, INCX, BASE_SHIFT slli.d INCY, INCY, BASE_SHIFT - movfr2gr.d t1, ALPHA + MTG t1, ALPHA +#ifdef DOUBLE vreplgr2vr.d VXA, t1 +#else + vreplgr2vr.w VXA, t1 +#endif srai.d I, N, 3 bne INCX, TEMP, .L20 @@ -56,11 +87,12 @@ .L11: bge $r0, I, .L113 - fcmp.ceq.d $fcc0, ALPHA, a2 + CMPEQ $fcc0, ALPHA, a2 bceqz $fcc0, .L112 .align 3 .L111: +#ifdef DOUBLE vld VX0, X, 0 * SIZE vld VX2, Y, 0 * SIZE vld VX1, X, 2 * SIZE @@ -75,16 +107,27 @@ vld VX3, Y, 6 * SIZE vfadd.d VX2, VX0, VX2 vfadd.d VX3, VX1, VX3 - addi.d I, I, -1 vst VX2, Y, 4 * SIZE vst VX3, Y, 6 * SIZE +#else + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vfadd.s VX2, VX0, VX2 + vfadd.s VX3, VX1, VX3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE +#endif addi.d X, X, 8 * SIZE addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 blt $r0, I, .L111 b .L113 .align 3 .L112: +#ifdef DOUBLE vld VX0, X, 0 * SIZE vld VX2, Y, 0 * SIZE vld VX1, X, 2 * SIZE @@ -104,6 +147,19 @@ vst VX2, Y, 4 * SIZE vst VX3, Y, 6 * SIZE addi.d Y, Y, 8 * SIZE +#else + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vfmadd.s VX2, VX0, VXA, VX2 + vfmadd.s VX3, VX1, VXA, VX3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 +#endif blt $r0, I, .L112 .align 3 @@ -113,11 +169,11 @@ .align 3 .L114: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE addi.d I, I, -1 - fmadd.d $f14, $f12, $f0, $f14 - fst.d $f14, Y, 0 * SIZE + MADD $f14, $f12, $f0, $f14 + ST $f14, Y, 0 * SIZE addi.d X, X, 
SIZE addi.d Y, Y, SIZE blt $r0, I, .L114 @@ -130,6 +186,7 @@ .align 3 .L121: +#ifdef DOUBLE vld VX0, X, 0 * SIZE ld.d t1, Y, 0 * SIZE add.d Y, Y, INCY @@ -180,6 +237,54 @@ add.d YY, YY, INCY addi.d X, X, 8 * SIZE addi.d I, I, -1 +#else + vld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmadd.s VX2, VX0, VXA, VX2 + vld VX1, X, 4 * SIZE + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + vfmadd.s VX3, VX1, VXA, VX3 + addi.d I, I, -1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE +#endif blt $r0, I, .L121 .align 3 @@ -189,11 +294,11 @@ .align 3 .L123: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE addi.d I, I, -1 - fmadd.d $f14, $f12, $f0, $f14 - fst.d $f14, Y, 0 * SIZE + MADD $f14, $f12, $f0, $f14 + ST $f14, Y, 0 * SIZE addi.d X, X, SIZE add.d Y, Y, INCY blt $r0, I, .L123 @@ -205,6 +310,7 @@ .align 3 .L211: +#ifdef DOUBLE vld VX2, Y, 0 * SIZE ld.d t1, X, 0 * SIZE add.d X, X, INCX @@ -242,6 +348,39 @@ vfmadd.d VX3, VX1, VXA, VX3 addi.d I, I, -1 vst VX3, Y, 6 * SIZE +#else + vld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfmadd.s VX2, VX0, VXA, VX2 + vld VX3, Y, 4 * SIZE + vst VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + add.d X, X, INCX + vfmadd.s VX3, VX1, VXA, VX3 + addi.d I, I, -1 + vst VX3, Y, 4 * SIZE +#endif addi.d Y, Y, 8 * SIZE blt $r0, I, .L211 .align 3 @@ -252,11 +391,11 @@ .align 3 .L213: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE addi.d I, I, -1 - fmadd.d $f14, $f12, $f0, $f14 - fst.d $f14, Y, 0 * SIZE + MADD $f14, $f12, $f0, $f14 + ST $f14, Y, 0 * SIZE add.d X, X, INCX addi.d Y, Y, SIZE blt $r0, I, .L213 @@ -269,6 +408,7 @@ .align 3 .L222: +#ifdef DOUBLE ld.d t1, X, 0 * SIZE add.d X, X, INCX ld.d t2, X, 0 * SIZE @@ -337,6 +477,74 @@ vstelm.d VX3, YY, 0, 0 add.d YY, YY, INCY vstelm.d VX3, YY, 0, 1 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w 
VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmadd.s VX2, VX0, VXA, VX2 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + vfmadd.s VX3, VX1, VXA, VX3 + addi.d I, I, -1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 +#endif add.d YY, YY, INCY blt $r0, I, .L222 .align 3 @@ -347,11 +555,11 @@ .align 3 .L224: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE addi.d I, I, -1 - fmadd.d $f14, $f12, $f0, $f14 - fst.d $f14, Y, 0 * SIZE + MADD $f14, $f12, $f0, $f14 + ST $f14, Y, 0 * SIZE add.d X, X, INCX add.d Y, Y, INCY blt $r0, I, .L224 diff --git a/kernel/loongarch64/camax_lasx.S b/kernel/loongarch64/camax_lasx.S new file mode 100644 index 000000000..f9a4e9012 --- /dev/null +++ b/kernel/loongarch64/camax_lasx.S @@ -0,0 +1,212 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $f14 +#define t2 $f18 +#define t3 $f15 +#define t4 $f17 +#define s1 $f22 +#define s2 $f9 +#define s3 $f10 +#define s4 $f11 +#define TEMP $r16 +#define a0 $f20 +#define a1 $f21 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VT0 $xr13 +#define VT1 $xr14 +#define res0 $xr18 +#define neg1 $xr19 +#define VX0 $xr20 +#define VX1 $xr21 +#define VM0 $xr22 +#define VM1 $xr23 + + PROLOGUE + xvxor.v VM0, VM0, VM0 + xvxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L23 + .align 3 + +.L10: + xvld VX0, X, 0 + xvld VX1, X, 32 +#ifdef DOUBLE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 +#else + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 +#endif + XVFSUB x3, res0, x1 + XVFSUB x4, res0, x2 + XVFMAX x1, x1, x3 + XVFMAX x2, x2, x4 + XVFADD VM1, x1, x2 + XVFMAX VM0, VM0, VM1 +#ifdef DOUBLE + xvld VX0, X, 64 + xvld VX1, X, 96 + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + XVFSUB x3, res0, x1 + XVFSUB x4, res0, x2 + XVFMAX x1, x1, x3 + XVFMAX x2, x2, x4 + XVFADD VM1, x1, x2 + XVFMAX VM0, VM0, VM1 +#endif + addi.d I, I, -1 + addi.d X, X, 16 * SIZE + blt $r0, I, .L10 + .align 3 + +.L11: +#ifdef DOUBLE + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + XVFMAX VM0, x1, x2 +#else + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + XVFMAX VM0, x1, x2 + XVFMAX VM1, x3, x4 + XVFMAX VM0, VM0, VM1 +#endif + b .L23 + .align 3 + +.L20: // INCX!=1 + bge $r0, I, .L23 + .align 3 + +.L21: + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE + add.d X, X, INCX + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE + add.d X, X, INCX + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMAX s1, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE + add.d X, X, INCX + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE + add.d X, X, INCX + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMAX s1, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE + add.d X, X, INCX + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE + add.d X, X, INCX + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + addi.d I, I, -1 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMAX s3, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE + add.d X, X, INCX + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE + add.d X, X, INCX + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMAX s4, t1, t3 + blt $r0, I, .L21 + .align 3 + +.L22: + FMAX s1, s1, s2 + FMAX s3, s3, s4 + FMAX s1, s1, s3 + .align 3 + +.L23: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + addi.d I, I, -1 + FABS a0, a0 + FABS a1, a1 + ADD a0, a0, a1 + add.d X, X, INCX + FMAX s1, a0, s1 + blt $r0, I, .L24 + .align 3 + +.L999: + MOV $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/camax_lsx.S b/kernel/loongarch64/camax_lsx.S new file mode 100644 index 000000000..cf46cb016 --- /dev/null +++ b/kernel/loongarch64/camax_lsx.S @@ -0,0 +1,239 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $f14 +#define t2 $f18 +#define t3 $f15 +#define t4 $f17 +#define s1 $f22 +#define s2 $f9 +#define s3 $f10 +#define s4 $f11 +#define TEMP $r16 +#define a0 $f20 +#define a1 $f21 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VT0 $vr13 +#define VT1 $vr14 +#define res0 $vr18 +#define neg1 $vr19 +#define VX0 $vr20 +#define VX1 $vr21 +#define VM0 $vr22 +#define VM1 $vr23 + + PROLOGUE + vxor.v VM0, VM0, VM0 + vxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L23 + .align 3 + +.L10: + vld VX0, X, 0 + vld VX1, X, 16 +#ifdef DOUBLE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 +#else + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 +#endif + VFSUB x3, res0, x1 + VFSUB x4, res0, x2 + VFMAX x1, x1, x3 + VFMAX x2, x2, x4 + VFADD VM1, x1, x2 + + vld VX0, X, 32 + vld VX1, X, 48 +#ifdef DOUBLE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 +#else + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 +#endif + VFSUB x3, res0, x1 + VFSUB x4, res0, x2 + VFMAX x1, x1, x3 + VFMAX x2, x2, x4 + VFADD x1, x1, x2 + VFMAX VM1, x1, VM1 + VFMAX VM0, VM0, VM1 +#ifdef DOUBLE + vld VX0, X, 64 + vld VX1, X, 80 + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + VFSUB x3, res0, x1 + VFSUB x4, res0, x2 + VFMAX x1, x1, x3 + VFMAX x2, x2, x4 + VFADD VM1, x1, x2 + + vld VX0, X, 96 + vld VX1, X, 112 + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + VFSUB x3, res0, x1 + VFSUB x4, res0, x2 + VFMAX x1, x1, x3 + VFMAX x2, x2, x4 + VFADD x1, x1, x2 + VFMAX VM1, x1, VM1 + VFMAX VM0, VM0, VM1 +#endif + addi.d X, X, 16 * SIZE + addi.d I, I, -1 + blt $r0, I, .L10 + .align 3 + +.L11: +#ifdef DOUBLE + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + VFMAX VM0, x1, x2 +#else + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w 
x3, VM0, 2 + vreplvei.w x4, VM0, 3 + VFMAX VM1, x1, x2 + VFMAX VM0, x3, x4 + VFMAX VM0, VM0, VM1 +#endif + b .L23 + .align 3 + +.L20: // INCX!=1 + bge $r0, I, .L23 + .align 3 + +.L21: + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE + add.d X, X, INCX + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE + add.d X, X, INCX + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMAX s1, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE + add.d X, X, INCX + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE + add.d X, X, INCX + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMAX s1, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE + add.d X, X, INCX + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE + add.d X, X, INCX + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + addi.d I, I, -1 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMAX s3, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE + add.d X, X, INCX + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE + add.d X, X, INCX + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMAX s4, t1, t3 + blt $r0, I, .L21 + .align 3 + +.L22: + FMAX s1, s1, s2 + FMAX s3, s3, s4 + FMAX s1, s1, s3 + .align 3 + +.L23: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + addi.d I, I, -1 + FABS a0, a0 + FABS a1, a1 + ADD a0, a0, a1 + add.d X, X, INCX + FMAX s1, a0, s1 + blt $r0, I, .L24 + .align 3 + +.L999: + MOV $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/camin_lasx.S b/kernel/loongarch64/camin_lasx.S new file mode 100644 index 000000000..c1c4c98c8 --- /dev/null +++ b/kernel/loongarch64/camin_lasx.S @@ -0,0 +1,221 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define TEMP $r16 +#define t1 $f14 +#define t2 $f18 +#define t3 $f15 +#define t4 $f17 +#define s1 $f22 +#define s2 $f9 +#define s3 $f10 +#define s4 $f11 +#define a0 $f20 +#define a1 $f21 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VT0 $xr13 +#define VT1 $xr14 +#define res0 $xr18 +#define neg1 $xr19 +#define VX0 $xr20 +#define VX1 $xr21 +#define VM0 $xr22 +#define VM1 $xr23 + + PROLOGUE + MTC s1, $r0 + xvxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + FABS a0, a0 + FABS a1, a1 + ADD s1, a1, a0 +#ifdef DOUBLE + xvreplve0.d VM0, VM0 +#else + xvreplve0.w VM0, VM0 +#endif + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L23 + .align 3 + +.L10: + xvld VX0, X, 0 + xvld VX1, X, 32 +#ifdef DOUBLE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 +#else + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 +#endif + XVFSUB x3, res0, x1 + XVFSUB x4, res0, x2 + XVFMAX x1, x1, x3 + XVFMAX x2, x2, x4 + XVFADD VM1, x1, x2 + XVFMIN VM0, VM0, VM1 +#ifdef DOUBLE + xvld VX0, X, 64 + xvld VX1, X, 96 + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + XVFSUB x3, res0, x1 + XVFSUB x4, res0, x2 + XVFMAX x1, x1, x3 + XVFMAX x2, x2, x4 + XVFADD VM1, x1, x2 + XVFMIN VM0, VM0, VM1 +#endif + addi.d I, I, -1 + addi.d X, X, 16 * SIZE + blt $r0, I, .L10 + .align 3 + +.L11: +#ifdef DOUBLE + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + XVFMIN VM0, x1, x2 +#else + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + XVFMIN VM0, x1, x2 + XVFMIN VM1, x3, x4 + XVFMIN VM0, VM0, VM1 +#endif + b .L23 + .align 3 + +.L20: // INCX!=1 + bge $r0, I, .L23 + .align 3 + +.L21: + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE + add.d X, X, INCX + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE + add.d X, X, INCX + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMIN s1, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE + add.d X, X, INCX + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE + add.d X, X, INCX + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMIN s1, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE + add.d X, X, INCX + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE + add.d X, X, INCX + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + addi.d I, I, -1 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMIN s3, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE + add.d X, X, INCX + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE + add.d X, X, INCX + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMIN s4, t1, t3 + blt $r0, I, .L21 + .align 3 + +.L22: + FMIN s1, s1, s2 + FMIN s3, s3, s4 + FMIN s1, s1, s3 + .align 3 + +.L23: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + addi.d I, I, -1 + FABS a0, a0 + FABS a1, a1 + ADD a0, a0, a1 + add.d X, X, INCX + FMIN s1, a0, s1 + blt $r0, I, .L24 + .align 3 + +.L999: + MOV $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/camin_lsx.S b/kernel/loongarch64/camin_lsx.S new file mode 100644 index 000000000..ff666ea8f --- /dev/null +++ b/kernel/loongarch64/camin_lsx.S @@ -0,0 +1,248 @@ 
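
For reference, the camax/camin/casum kernels added in this patch all reduce the same per-element magnitude, |Re(x[i])| + |Im(x[i])| (the usual BLAS convention for complex amax/amin/asum), differing only in whether they take the maximum, the minimum, or the sum; the LSX/LASX code above and below vectorizes exactly this. A minimal scalar C sketch of that semantics follows — single precision shown, positive incx assumed as in the assembly, and the function names with the _ref suffix are illustrative only, not part of the patch:

#include <math.h>
#include <stddef.h>

/* x holds n complex values as interleaved (re, im) pairs;
 * incx is the stride in complex elements, assumed > 0. */
static float camax_ref(size_t n, const float *x, size_t incx)
{
    float m = 0.0f;                                /* kernel starts from a zeroed accumulator */
    for (size_t i = 0; i < n; i++, x += 2 * incx) {
        float v = fabsf(x[0]) + fabsf(x[1]);       /* |re| + |im| */
        if (v > m) m = v;
    }
    return m;
}

static float camin_ref(size_t n, const float *x, size_t incx)
{
    if (n == 0) return 0.0f;
    float m = fabsf(x[0]) + fabsf(x[1]);           /* seeded from the first element, as in the prologue */
    for (size_t i = 0; i < n; i++, x += 2 * incx) {
        float v = fabsf(x[0]) + fabsf(x[1]);
        if (v < m) m = v;
    }
    return m;
}

static float casum_ref(size_t n, const float *x, size_t incx)
{
    float s = 0.0f;
    for (size_t i = 0; i < n; i++, x += 2 * incx)
        s += fabsf(x[0]) + fabsf(x[1]);
    return s;
}

The vector kernels split the interleaved data into real and imaginary lanes (vpickev/vpickod), form the absolute values by taking max(x, 0 - x), add the two lanes, fold the partial results across the vector, and finally fall back to the scalar tail loop for the remaining n % 8 elements.
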
+/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $f14 +#define t2 $f18 +#define t3 $f15 +#define t4 $f17 +#define s1 $f22 +#define s2 $f9 +#define s3 $f10 +#define s4 $f11 +#define TEMP $r16 +#define a0 $f20 +#define a1 $f21 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VT0 $vr13 +#define VT1 $vr14 +#define res0 $vr18 +#define neg1 $vr19 +#define VX0 $vr20 +#define VX1 $vr21 +#define VM0 $vr22 +#define VM1 $vr23 + + PROLOGUE + MTC s1, $r0 + vxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + FABS a0, a0 + FABS a1, a1 + ADD s1, a1, a0 +#ifdef DOUBLE + vreplvei.d VM0, VM0, 0 +#else + vreplvei.w VM0, VM0, 0 +#endif + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L23 + .align 3 + +.L10: + vld VX0, X, 0 + vld VX1, X, 16 +#ifdef DOUBLE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 +#else + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 +#endif + VFSUB x3, res0, x1 + VFSUB x4, res0, x2 + VFMAX x1, x1, x3 + VFMAX x2, x2, x4 + VFADD VM1, x1, x2 + + vld VX0, X, 32 + vld VX1, X, 48 +#ifdef DOUBLE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 +#else + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 +#endif + VFSUB x3, res0, x1 + VFSUB x4, res0, x2 + VFMAX x1, x1, x3 + VFMAX x2, x2, x4 + VFADD x1, x1, x2 + VFMIN VM1, x1, VM1 + VFMIN VM0, VM0, VM1 +#ifdef DOUBLE + vld VX0, X, 64 + vld VX1, X, 80 + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + VFSUB x3, res0, x1 + VFSUB x4, res0, x2 + VFMAX x1, x1, x3 + VFMAX x2, x2, x4 + VFADD VM1, x1, x2 + + vld VX0, X, 96 + vld VX1, X, 112 + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + VFSUB x3, res0, x1 + VFSUB x4, res0, x2 + VFMAX x1, x1, x3 + VFMAX x2, x2, x4 + 
VFADD x1, x1, x2 + VFMIN VM1, x1, VM1 + VFMIN VM0, VM0, VM1 +#endif + addi.d I, I, -1 + addi.d X, X, 16 * SIZE + blt $r0, I, .L10 + .align 3 + +.L11: +#ifdef DOUBLE + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + VFMIN VM0, x1, x2 +#else + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + VFMIN VM1, x1, x2 + VFMIN VM0, x3, x4 + VFMIN VM0, VM0, VM1 +#endif + b .L23 + .align 3 + +.L20: // INCX!=1 + bge $r0, I, .L23 + .align 3 + +.L21: + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE + add.d X, X, INCX + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE + add.d X, X, INCX + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMIN s1, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE + add.d X, X, INCX + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE + add.d X, X, INCX + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMIN s1, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE + add.d X, X, INCX + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE + add.d X, X, INCX + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + addi.d I, I, -1 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMIN s3, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE + add.d X, X, INCX + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE + add.d X, X, INCX + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMIN s4, t1, t3 + blt $r0, I, .L21 + .align 3 + +.L22: + FMIN s1, s1, s2 + FMIN s3, s3, s4 + FMIN s1, s1, s3 + .align 3 + +.L23: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + addi.d I, I, -1 + FABS a0, a0 + FABS a1, a1 + ADD a0, a0, a1 + add.d X, X, INCX + FMIN s1, a0, s1 + blt $r0, I, .L24 + .align 3 + +.L999: + MOV $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/casum_lasx.S b/kernel/loongarch64/casum_lasx.S new file mode 100644 index 000000000..caf0ff969 --- /dev/null +++ b/kernel/loongarch64/casum_lasx.S @@ -0,0 +1,329 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 +#define res1 $xr16 +#define res2 $xr17 +#define res3 $xr18 +#define res0 $xr19 +#define neg1 $xr20 +#define VT0 $xr21 +#define VT1 $xr22 + + PROLOGUE + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + xvxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 +#ifdef DOUBLE + li.d t1, -1 + xvreplgr2vr.d neg1, t1 + xvffint.d.l neg1, neg1 +#else + li.w t1, -1 + xvreplgr2vr.w neg1, t1 + xvffint.s.w neg1, neg1 +#endif + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvfmul.d VX2, neg1, VX0 + xvfmul.d VX3, neg1, VX1 + xvfcmp.clt.d VT0, VX0, res0 + xvfcmp.clt.d VT1, VX1, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvbitsel.v VX1, VX1, VX3, VT1 + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 + xvld VX2, X, 8 * SIZE + xvld VX3, X, 12 * SIZE + xvfmul.d VX0, neg1, VX2 + xvfmul.d VX1, neg1, VX3 + xvfcmp.clt.d VT0, VX2, res0 + xvfcmp.clt.d VT1, VX3, res0 + xvbitsel.v VX2, VX2, VX0, VT0 + xvbitsel.v VX3, VX3, VX1, VT1 + xvfadd.d res2, VX2, VX3 + xvfadd.d res1, res1, res2 +#else + xvld VX0, X, 0 * SIZE + xvld VX1, X, 8 * SIZE + xvfmul.s VX2, neg1, VX0 + xvfmul.s VX3, neg1, VX1 + xvfcmp.clt.s VT0, VX0, res0 + xvfcmp.clt.s VT1, VX1, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvbitsel.v VX1, VX1, VX3, VT1 + xvfadd.s res2, VX0, VX1 + xvfadd.s res1, res2, res1 +#endif + addi.d X, X, 16 * SIZE + addi.d I, I, -1 + blt $r0, I, .L11 + .align 3 + +.L12: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 +#else + xvfadd.s res2, res1, res2 + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res2, 4 + xvpickve.w VX1, res2, 5 + xvpickve.w VX2, res2, 6 + xvpickve.w VX3, res2, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX2, res1 +#endif + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + FABS a1, a1 + FABS a2, a2 + addi.d I, I, -1 + ADD a1, a1, a2 + ADD s1, a1, s1 + addi.d X, X, 2 * SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d 
VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfmul.d VX2, neg1, VX0 + xvfmul.d VX3, neg1, VX1 + xvfcmp.clt.d VT0, VX0, res0 + xvfcmp.clt.d VT1, VX1, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvbitsel.v VX1, VX1, VX3, VT1 + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfmul.d VX2, neg1, VX0 + xvfmul.d VX3, neg1, VX1 + xvfcmp.clt.d VT0, VX0, res0 + xvfcmp.clt.d VT1, VX1, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvbitsel.v VX1, VX1, VX3, VT1 + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX1, t1, 0 + xvinsgr2vr.w VX1, t2, 1 + xvinsgr2vr.w VX1, t3, 2 + xvinsgr2vr.w VX1, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX1, t1, 4 + xvinsgr2vr.w VX1, t2, 5 + xvinsgr2vr.w VX1, t3, 6 + xvinsgr2vr.w VX1, t4, 7 + xvfmul.s VX2, neg1, VX0 + xvfmul.s VX3, neg1, VX1 + xvfcmp.clt.s VT0, VX0, res0 + xvfcmp.clt.s VT1, VX1, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvbitsel.v VX1, VX1, VX3, VT1 + xvfadd.s res2, VX0, VX1 + xvfadd.s res1, res2, res1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 +#else + xvfadd.s res2, res1, res2 + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res2, 4 + xvpickve.w VX1, res2, 5 + xvpickve.w VX2, res2, 6 + xvpickve.w VX3, res2, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX2, res1 +#endif + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + FABS a1, a1 + FABS a2, a2 + addi.d I, I, -1 + ADD a1, a1, a2 + ADD s1, a1, s1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + MOV $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/casum_lsx.S b/kernel/loongarch64/casum_lsx.S new file mode 100644 index 000000000..4822f2080 --- /dev/null +++ 
b/kernel/loongarch64/casum_lsx.S @@ -0,0 +1,358 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 +#define res1 $vr16 +#define res2 $vr17 +#define res3 $vr18 +#define res0 $vr19 +#define neg1 $vr20 +#define VT0 $vr21 +#define VT1 $vr22 + + PROLOGUE + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + vxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 +#ifdef DOUBLE + li.d t1, -1 + vreplgr2vr.d neg1, t1 + vffint.d.l neg1, neg1 +#else + li.w t1, -1 + vreplgr2vr.w neg1, t1 + vffint.s.w neg1, neg1 +#endif + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + vld VX2, X, 4 * SIZE + vld VX3, X, 6 * SIZE + vfmul.d VX0, neg1, VX2 + vfmul.d VX1, neg1, VX3 + vfcmp.clt.d VT0, VX2, res0 + vfcmp.clt.d VT1, VX3, res0 + vbitsel.v VX2, VX2, VX0, VT0 + vbitsel.v VX3, VX3, VX1, VT1 + vfadd.d res2, VX2, VX3 + vfadd.d res1, res1, res2 + vld VX0, X, 8 * SIZE + vld VX1, X, 10 * SIZE + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + vld VX2, X, 12 * SIZE + vld VX3, X, 14 * SIZE + vfmul.d VX0, neg1, VX2 
+ vfmul.d VX1, neg1, VX3 + vfcmp.clt.d VT0, VX2, res0 + vfcmp.clt.d VT1, VX3, res0 + vbitsel.v VX2, VX2, VX0, VT0 + vbitsel.v VX3, VX3, VX1, VT1 + vfadd.d res2, VX2, VX3 + vfadd.d res1, res1, res2 + addi.d I, I, -1 +#else + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + vfmul.s VX2, neg1, VX0 + vfmul.s VX3, neg1, VX1 + vfcmp.clt.s VT0, VX0, res0 + vfcmp.clt.s VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.s res2, VX0, VX1 + vld VX0, X, 8 * SIZE + vld VX1, X, 12 * SIZE + addi.d I, I, -1 + vfmul.s VX2, neg1, VX0 + vfmul.s VX3, neg1, VX1 + vfcmp.clt.s VT0, VX0, res0 + vfcmp.clt.s VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.s res3, VX1, VX0 + vfadd.s res2, res3, res2 + vfadd.s res1, res1, res2 +#endif + addi.d X, X, 16 * SIZE + blt $r0, I, .L11 + .align 3 + +.L12: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 +#endif + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + FABS a1, a1 + FABS a2, a2 + addi.d I, I, -1 + ADD a1, a1, a2 + ADD s1, a1, s1 + addi.d X, X, 2 * SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, 
X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vfmul.s VX2, neg1, VX0 + vfmul.s VX3, neg1, VX1 + vfcmp.clt.s VT0, VX0, res0 + vfcmp.clt.s VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.s res2, VX0, VX1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + vfmul.s VX0, neg1, VX2 + vfmul.s VX1, neg1, VX3 + vfcmp.clt.s VT0, VX2, res0 + vfcmp.clt.s VT1, VX3, res0 + vbitsel.v VX2, VX2, VX0, VT0 + vbitsel.v VX3, VX3, VX1, VT1 + vfadd.s res3, VX2, VX3 + vfadd.s res2, res3, res2 + vfadd.s res1, res1, res2 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 +#endif + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + FABS a1, a1 + FABS a2, a2 + addi.d I, I, -1 + ADD a1, a1, a2 + ADD s1, a1, s1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + MOV $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/caxpby_lasx.S b/kernel/loongarch64/caxpby_lasx.S new file mode 100644 index 000000000..c5802092e --- /dev/null +++ b/kernel/loongarch64/caxpby_lasx.S @@ -0,0 +1,1046 @@ +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define ALPHAR $f0 +#define ALPHAI $f1 +#define X $r5 +#define INCX $r6 +#define BETAR $f2 +#define BETAI $f3 +#define Y $r7 +#define INCY $r8 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define YY $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define VX0 $xr8 +#define VX1 $xr20 +#define VX2 $xr21 +#define VX3 $xr22 +#define VXAR $xr23 +#define VXAI $xr19 +#define VXBR $xr14 +#define VXBI $xr13 +#define VXZ $xr12 +#define x1 $xr18 +#define x2 $xr17 +#define x3 $xr16 +#define x4 $xr15 + + PROLOGUE + + bge $r0, N, .L999 + movgr2fr.d a1, $r0 + FFINT a1, a1 + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + MTG t1, ALPHAR + MTG t2, ALPHAI + MTG t3, BETAR + MTG t4, BETAI +#ifdef DOUBLE + xvreplgr2vr.d VXAR, t1 + xvreplgr2vr.d VXAI, t2 + xvreplgr2vr.d VXBR, t3 + xvreplgr2vr.d VXBI, t4 +#else + xvreplgr2vr.w VXAR, t1 + xvreplgr2vr.w VXAI, t2 + xvreplgr2vr.w VXBR, t3 + xvreplgr2vr.w VXBI, t4 +#endif + xvxor.v VXZ, VXZ, VXZ + // If incx == 0 || incy == 0, do one by one + and TEMP, INCX, INCY + or I, N, N + beqz TEMP, .L998 + + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT +#ifdef DOUBLE + srai.d I, N, 2 +#else + srai.d I, N, 3 +#endif + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + CMPEQ $fcc0, BETAR, a1 + CMPEQ $fcc1, BETAI, a1 + CMPEQ $fcc2, ALPHAR, a1 + CMPEQ 
$fcc3, ALPHAI, a1 + bceqz $fcc0, .L13 + bceqz $fcc1, .L13 + b .L14 + .align 3 + +.L13: + bceqz $fcc2, .L114 + bceqz $fcc3, .L114 //!(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0) + b .L113 //!(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) + +.L14: + bceqz $fcc2, .L112 + bceqz $fcc3, .L112 //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0) + b .L111 //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) + .align 3 + +.L111: //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) + xvst VXZ, Y, 0 * SIZE +#ifdef DOUBLE + xvst VXZ, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvst VXZ, Y, 8 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0) +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 +#else + xvld VX0, X, 0 * SIZE + xvld VX1, X, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 +#endif + XVFMUL x3, VXAI, x2 + XVFMUL x4, VXAI, x1 + XVMSUB x3, VXAR, x1, x3 + XVFMADD x4, VXAR, x2, x4 +#ifdef DOUBLE + xvilvl.d VX2, x4 ,x3 + xvilvh.d VX3, x4, x3 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvilvl.w VX2, x4 ,x3 + xvilvh.w VX3, x4, x3 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 8 * SIZE + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: //!(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) +#ifdef DOUBLE + xvld VX0, Y, 0 * SIZE + xvld VX1, Y, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 +#else + xvld VX0, Y, 0 * SIZE + xvld VX1, Y, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 +#endif + XVFMUL x3, VXBI, x2 + XVFMUL x4, VXBI, x1 + XVMSUB x3, VXBR, x1, x3 + XVFMADD x4, VXBR, x2, x4 +#ifdef DOUBLE + xvilvl.d VX2, x4 ,x3 + xvilvh.d VX3, x4, x3 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE + addi.d X, Y, 8 * SIZE +#else + xvilvl.w VX2, x4 ,x3 + xvilvh.w VX3, x4, x3 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 8 * SIZE + addi.d X, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX3, Y, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 +#else + xvld VX0, X, 0 * SIZE + xvld VX1, X, 8 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX3, Y, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvpickev.w x3, VX3, VX2 + xvpickod.w x4, VX3, VX2 +#endif + XVFMUL VX0, VXAI, x2 + XVFMUL VX1, VXAI, x1 + XVFMUL VX2, VXBI, x4 + XVFMUL VX3, VXBI, x3 + XVMSUB VX0, VXAR, x1, VX0 + XVFMADD VX1, VXAR, x2, VX1 + XVMSUB VX2, VXBR, x3, VX2 + XVFMADD VX3, VXBR, x4, VX3 + XVFADD x3, VX0, VX2 + XVFADD x4, VX1, VX3 +#ifdef DOUBLE + xvilvl.d VX2, x4 ,x3 + xvilvh.d VX3, x4, x3 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvilvl.w VX2, x4 ,x3 + xvilvh.w VX3, x4, x3 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 8 * SIZE + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L997 + move YY, Y + .align 3 + +.L121: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, 
Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 2 + xvinsgr2vr.d x4, t4, 2 + + xvld VX1, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.d x3, t1, 1 + xvinsgr2vr.d x4, t2, 1 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 + add.d Y, Y, INCY + + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX1, VXAI, x1 + xvfmul.d VX2, VXBI, x4 + xvfmul.d VX3, VXBI, x3 + xvfmsub.d VX0, VXAR, x1, VX0 + xvfmadd.d VX1, VXAR, x2, VX1 + xvfmsub.d VX2, VXBR, x3, VX2 + xvfmadd.d VX3, VXBR, x4, VX3 + xvfadd.d x3, VX0, VX2 + xvfadd.d x4, VX1, VX3 + addi.d I, I, -1 + xvstelm.d x3, YY, 0 * SIZE, 0 + xvstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 2 + xvstelm.d x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 1 + xvstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 3 + xvstelm.d x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + blt $r0, I, .L121 + b .L997 + .align 3 +#else + xvld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + xvld VX1, X, 8 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + add.d Y, Y, INCY + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 + xvinsgr2vr.w x4, t4, 3 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 + add.d Y, Y, INCY + + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + XVFMUL VX0, VXAI, x2 + XVFMUL VX1, VXAI, x1 + XVFMUL VX2, VXBI, x4 + XVFMUL VX3, VXBI, x3 + XVMSUB VX0, VXAR, x1, VX0 + XVFMADD VX1, VXAR, x2, VX1 + XVMSUB VX2, VXBR, x3, VX2 + XVFMADD VX3, VXBR, x4, VX3 + XVFADD x3, VX0, VX2 + XVFADD x4, VX1, VX3 + addi.d I, I, -1 + xvstelm.w x3, YY, 0 * SIZE, 0 + xvstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 1 + xvstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 4 + xvstelm.w x4, YY, 1 * SIZE, 4 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 5 + xvstelm.w x4, YY, 1 * SIZE, 5 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 2 + xvstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 3 + xvstelm.w x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 6 + xvstelm.w x4, YY, 1 * SIZE, 6 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 7 + xvstelm.w x4, YY, 1 * SIZE, 7 + add.d YY, YY, INCY + addi.d X, X, 16 * SIZE + blt $r0, I, .L121 + b .L997 + .align 3 +#endif + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L997 + .align 3 + +.L211: +#ifdef DOUBLE + xvld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + 
xvinsgr2vr.d x1, t3, 2 + xvinsgr2vr.d x2, t4, 2 + xvld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 1 + xvinsgr2vr.d x2, t2, 1 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX1, VXAI, x1 + xvfmul.d VX2, VXBI, x4 + xvfmul.d VX3, VXBI, x3 + xvfmsub.d VX0, VXAR, x1, VX0 + xvfmadd.d VX1, VXAR, x2, VX1 + xvfmsub.d VX2, VXBR, x3, VX2 + xvfmadd.d VX3, VXBR, x4, VX3 + xvfadd.d x3, VX0, VX2 + xvfadd.d x4, VX1, VX3 + xvilvl.d VX2, x4 ,x3 + xvilvh.d VX3, x4, x3 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L211 + b .L997 + .align 3 +#else + xvld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + xvld VX3, Y, 8 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + add.d X, X, INCX + + xvpickev.w x3, VX3, VX2 + xvpickod.w x4, VX3, VX2 + XVFMUL VX0, VXAI, x2 + XVFMUL VX1, VXAI, x1 + XVFMUL VX2, VXBI, x4 + XVFMUL VX3, VXBI, x3 + XVMSUB VX0, VXAR, x1, VX0 + XVFMADD VX1, VXAR, x2, VX1 + XVMSUB VX2, VXBR, x3, VX2 + XVFMADD VX3, VXBR, x4, VX3 + XVFADD x3, VX0, VX2 + XVFADD x4, VX1, VX3 + xvilvl.w VX2, x4 ,x3 + xvilvh.w VX3, x4, x3 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 8 * SIZE + addi.d Y, Y, 16 * SIZE + blt $r0, I, .L211 + b .L997 + .align 3 +#endif + +.L22: + bge $r0, I, .L997 + move YY, Y + CMPEQ $fcc0, BETAR, a1 + CMPEQ $fcc1, BETAI, a1 + CMPEQ $fcc2, ALPHAR, a1 + CMPEQ $fcc3, ALPHAI, a1 + bceqz $fcc0, .L23 + bceqz $fcc1, .L23 + b .L24 + .align 3 + +.L23: + bceqz $fcc2, .L224 + bceqz $fcc3, .L224 //!(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0) + b .L223 //!(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) + .align 3 + +.L24: + bceqz $fcc2, .L222 + bceqz $fcc3, .L222 //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0) + b .L221 //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) + .align 3 + +.L221: //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) +#ifdef DOUBLE + xvstelm.d VXZ, Y, 0, 0 + xvstelm.d VXZ, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.d VXZ, Y, 0, 0 + xvstelm.d VXZ, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.d VXZ, Y, 0, 0 + xvstelm.d VXZ, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.d VXZ, Y, 0, 0 + xvstelm.d VXZ, Y, 0, 0 + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 + .align 3 +#else + xvstelm.w VXZ, Y, 0, 0 + xvstelm.w VXZ, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.w VXZ, Y, 0, 0 + xvstelm.w VXZ, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.w VXZ, Y, 0, 0 + xvstelm.w VXZ, Y, 0, 0 + add.d 
Y, Y, INCY + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 0, 0 + add.d Y, Y, INCY + xvstelm.w VXZ, Y, 0, 0 + xvstelm.w VXZ, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.w VXZ, Y, 0, 0 + xvstelm.w VXZ, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.w VXZ, Y, 0, 0 + xvstelm.w VXZ, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.w VXZ, Y, 0, 0 + xvstelm.w VXZ, Y, 0, 0 + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 + .align 3 +#endif + +.L222: //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0) +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + xvfmul.d x3, VXAI, x2 + xvfmul.d x4, VXAI, x1 + xvfmsub.d x3, VXAR, x1, x3 + xvfmadd.d x4, VXAR, x2, x4 + addi.d I, I, -1 + xvstelm.d x3, YY, 0 * SIZE, 0 + xvstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 1 + xvstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 2 + xvstelm.d x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 3 + xvstelm.d x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + blt $r0, I, .L222 + b .L997 + .align 3 +#else + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + add.d X, X, INCX + XVFMUL x3, VXAI, x2 + XVFMUL x4, VXAI, x1 + XVMSUB x3, VXAR, x1, x3 + XVFMADD x4, VXAR, x2, x4 + addi.d I, I, -1 + xvstelm.w x3, YY, 0 * SIZE, 0 + xvstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 1 + xvstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 2 + xvstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 3 + xvstelm.w x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 4 + xvstelm.w x4, YY, 1 * SIZE, 4 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 5 + xvstelm.w x4, YY, 1 * SIZE, 5 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 6 + xvstelm.w x4, YY, 1 * SIZE, 6 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 7 + xvstelm.w x4, YY, 1 * SIZE, 7 + add.d YY, YY, INCY + blt $r0, I, .L222 + b .L997 + .align 3 +#endif + +.L223: +#ifdef DOUBLE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + + ld.d t1, Y, 0 * SIZE + ld.d 
t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d Y, Y, INCY + xvfmul.d x3, VXBI, x2 + xvfmul.d x4, VXBI, x1 + xvfmsub.d x3, VXBR, x1, x3 + xvfmadd.d x4, VXBR, x2, x4 + + addi.d I, I, -1 + xvstelm.d x3, YY, 0 * SIZE, 0 + xvstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 1 + xvstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 2 + xvstelm.d x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 3 + xvstelm.d x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + blt $r0, I, .L223 + b .L997 + .align 3 +#else + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + add.d Y, Y, INCY + + XVFMUL x3, VXBI, x2 + XVFMUL x4, VXBI, x1 + XVMSUB x3, VXBR, x1, x3 + XVFMADD x4, VXBR, x2, x4 + addi.d I, I, -1 + xvstelm.w x3, YY, 0 * SIZE, 0 + xvstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 1 + xvstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 2 + xvstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 3 + xvstelm.w x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 4 + xvstelm.w x4, YY, 1 * SIZE, 4 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 5 + xvstelm.w x4, YY, 1 * SIZE, 5 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 6 + xvstelm.w x4, YY, 1 * SIZE, 6 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 7 + xvstelm.w x4, YY, 1 * SIZE, 7 + add.d YY, YY, INCY + blt $r0, I, .L223 + b .L997 + .align 3 +#endif + +.L224: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 1 + xvinsgr2vr.d x4, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.d x3, t1, 2 + xvinsgr2vr.d x4, t2, 2 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 + add.d Y, Y, INCY + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX1, VXAI, x1 + xvfmul.d VX2, VXBI, x4 + xvfmul.d VX3, VXBI, x3 + xvfmsub.d VX0, VXAR, 
x1, VX0 + xvfmadd.d VX1, VXAR, x2, VX1 + xvfmsub.d VX2, VXBR, x3, VX2 + xvfmadd.d VX3, VXBR, x4, VX3 + xvfadd.d x3, VX0, VX2 + xvfadd.d x4, VX1, VX3 + addi.d I, I, -1 + + xvstelm.d x3, YY, 0 * SIZE, 0 + xvstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 1 + xvstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 2 + xvstelm.d x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 3 + xvstelm.d x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + blt $r0, I, .L224 + b .L997 + .align 3 +#else + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 + xvinsgr2vr.w x4, t4, 3 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 + add.d Y, Y, INCY + + XVFMUL VX0, VXAI, x2 + XVFMUL VX1, VXAI, x1 + XVFMUL VX2, VXBI, x4 + XVFMUL VX3, VXBI, x3 + XVMSUB VX0, VXAR, x1, VX0 + XVFMADD VX1, VXAR, x2, VX1 + XVMSUB VX2, VXBR, x3, VX2 + XVFMADD VX3, VXBR, x4, VX3 + XVFADD x3, VX0, VX2 + XVFADD x4, VX1, VX3 + addi.d I, I, -1 + + xvstelm.w x3, YY, 0 * SIZE, 0 + xvstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 1 + xvstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 2 + xvstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 3 + xvstelm.w x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 4 + xvstelm.w x4, YY, 1 * SIZE, 4 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 5 + xvstelm.w x4, YY, 1 * SIZE, 5 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 6 + xvstelm.w x4, YY, 1 * SIZE, 6 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 7 + xvstelm.w x4, YY, 1 * SIZE, 7 + add.d YY, YY, INCY + blt $r0, I, .L224 + b .L997 + .align 3 +#endif + +.L997: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + 
addi.d I, I, -1 + MUL s1, ALPHAI, a2 + MUL s2, ALPHAI, a1 + MUL s3, BETAI, a4 + MUL s4, BETAI, a3 + MSUB s1, ALPHAR, a1, s1 + MADD s2, a2, ALPHAR, s2 + MSUB s3, BETAR, a3, s3 + MADD s4, a4, BETAR, s4 + ADD s3, s3, s1 + ADD s4, s4, s2 + ST s3, Y, 0 * SIZE + ST s4, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/caxpby_lsx.S b/kernel/loongarch64/caxpby_lsx.S new file mode 100644 index 000000000..247ae428e --- /dev/null +++ b/kernel/loongarch64/caxpby_lsx.S @@ -0,0 +1,1029 @@ +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define ALPHAR $f0 +#define ALPHAI $f1 +#define X $r5 +#define INCX $r6 +#define BETAR $f2 +#define BETAI $f3 +#define Y $r7 +#define INCY $r8 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define YY $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define VX0 $vr8 +#define VX1 $vr20 +#define VX2 $vr21 +#define VX3 $vr22 +#define VXAR $vr23 +#define VXAI $vr19 +#define VXBR $vr14 +#define VXBI $vr13 +#define VXZ $vr12 +#define x1 $vr18 +#define x2 $vr17 +#define x3 $vr16 +#define x4 $vr15 + + PROLOGUE + + bge $r0, N, .L999 + movgr2fr.d a1, $r0 +#ifdef DOUBLE + ffint.d.l a1, a1 +#else + ffint.s.l a1, a1 +#endif + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT +#ifdef DOUBLE + movfr2gr.d t1, ALPHAR + vreplgr2vr.d VXAR, t1 + movfr2gr.d t2, ALPHAI + vreplgr2vr.d VXAI, t2 + movfr2gr.d t3, BETAR + vreplgr2vr.d VXBR, t3 + movfr2gr.d t4, BETAI + vreplgr2vr.d VXBI, t4 +#else + movfr2gr.s t1, ALPHAR + vreplgr2vr.w VXAR, t1 + movfr2gr.s t2, ALPHAI + vreplgr2vr.w VXAI, t2 + movfr2gr.s t3, BETAR + vreplgr2vr.w VXBR, t3 + movfr2gr.s t4, BETAI + vreplgr2vr.w VXBI, t4 +#endif + vxor.v VXZ, VXZ, VXZ + // If incx == 0 || incy == 0, do one by one + and TEMP, INCX, INCY + or I, N, N + beqz TEMP, .L998 + + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 +#ifdef DOUBLE + fcmp.ceq.d $fcc0, BETAR, a1 + fcmp.ceq.d $fcc1, BETAI, a1 + fcmp.ceq.d $fcc2, ALPHAR, a1 + fcmp.ceq.d $fcc3, ALPHAI, a1 +#else + fcmp.ceq.s $fcc0, BETAR, a1 + fcmp.ceq.s $fcc1, BETAI, a1 + fcmp.ceq.s $fcc2, ALPHAR, a1 + fcmp.ceq.s $fcc3, ALPHAI, a1 +#endif + bceqz $fcc0, .L13 + bceqz $fcc1, .L13 + b .L14 + .align 3 + +.L13: + bceqz $fcc2, .L114 + bceqz $fcc3, .L114 //!(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0) + b .L113 //!(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) + +.L14: + bceqz $fcc2, .L112 + bceqz $fcc3, .L112 //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0) + b .L111 //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) + .align 3 + +.L111: //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) +#ifdef DOUBLE + vst VXZ, Y, 0 * SIZE + vst VXZ, Y, 2 * SIZE + vst VXZ, Y, 4 * SIZE + vst VXZ, Y, 6 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 +#else + vst VXZ, Y, 0 * SIZE + vst VXZ, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 +#endif + +.L112: //(beta_r == 0.0 
&& beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0) +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VXAI, x2 + vfmul.d x4, VXAI, x1 + vfmsub.d x3, VXAR, x1, x3 + vfmadd.d x4, VXAR, x2, x4 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 2 * SIZE + + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VXAI, x2 + vfmul.d x4, VXAI, x1 + vfmsub.d x3, VXAR, x1, x3 + vfmadd.d x4, VXAR, x2, x4 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 +#else + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s x3, VXAI, x2 + vfmul.s x4, VXAI, x1 + vfmsub.s x3, VXAR, x1, x3 + vfmadd.s x4, VXAR, x2, x4 + vilvl.w VX2, x4 ,x3 + vilvh.w VX3, x4, x3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 +#endif + +.L113: //!(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) +#ifdef DOUBLE + vld VX0, Y, 0 * SIZE + vld VX1, Y, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VXBI, x2 + vfmul.d x4, VXBI, x1 + vfmsub.d x3, VXBR, x1, x3 + vfmadd.d x4, VXBR, x2, x4 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 2 * SIZE + vld VX0, Y, 4 * SIZE + vld VX1, Y, 6 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VXBI, x2 + vfmul.d x4, VXBI, x1 + vfmsub.d x3, VXBR, x1, x3 + vfmadd.d x4, VXBR, x2, x4 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 +#else + vld VX0, Y, 0 * SIZE + vld VX1, Y, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s x3, VXBI, x2 + vfmul.s x4, VXBI, x1 + vfmsub.s x3, VXBR, x1, x3 + vfmadd.s x4, VXBR, x2, x4 + vilvl.w VX2, x4 ,x3 + vilvh.w VX3, x4, x3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 +#endif + +.L114: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vld VX2, Y, 0 * SIZE + vld VX3, Y, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 + vfmul.d VX0, VXAI, x2 + vfmul.d VX1, VXAI, x1 + vfmul.d VX2, VXBI, x4 + vfmul.d VX3, VXBI, x3 + vfmsub.d VX0, VXAR, x1, VX0 + vfmadd.d VX1, VXAR, x2, VX1 + vfmsub.d VX2, VXBR, x3, VX2 + vfmadd.d VX3, VXBR, x4, VX3 + vfadd.d x3, VX0, VX2 + vfadd.d x4, VX1, VX3 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 2 * SIZE + + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vld VX2, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 + vfmul.d VX0, VXAI, x2 + vfmul.d VX1, VXAI, x1 + vfmul.d VX2, VXBI, x4 + vfmul.d VX3, VXBI, x3 + vfmsub.d VX0, VXAR, x1, VX0 + vfmadd.d VX1, VXAR, x2, VX1 + vfmsub.d VX2, VXBR, x3, VX2 + vfmadd.d VX3, VXBR, x4, VX3 + vfadd.d x3, VX0, VX2 + vfadd.d x4, VX1, VX3 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 +#else + vld VX0, 
X, 0 * SIZE + vld VX1, X, 4 * SIZE + vld VX2, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, VX2 + vfmul.s VX0, VXAI, x2 + vfmul.s VX1, VXAI, x1 + vfmul.s VX2, VXBI, x4 + vfmul.s VX3, VXBI, x3 + vfmsub.s VX0, VXAR, x1, VX0 + vfmadd.s VX1, VXAR, x2, VX1 + vfmsub.s VX2, VXBR, x3, VX2 + vfmadd.s VX3, VXBR, x4, VX3 + vfadd.s x3, VX0, VX2 + vfadd.s x4, VX1, VX3 + vilvl.w VX2, x4 ,x3 + vilvh.w VX3, x4, x3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 +#endif + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L997 + move YY, Y + .align 3 + +.L121: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d VX0, VXAI, x2 + vfmul.d VX1, VXAI, x1 + vfmul.d VX2, VXBI, x4 + vfmul.d VX3, VXBI, x3 + vfmsub.d VX0, VXAR, x1, VX0 + vfmadd.d VX1, VXAR, x2, VX1 + vfmsub.d VX2, VXBR, x3, VX2 + vfmadd.d VX3, VXBR, x4, VX3 + vfadd.d x3, VX0, VX2 + vfadd.d x4, VX1, VX3 + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d VX0, VXAI, x2 + vfmul.d VX1, VXAI, x1 + vfmul.d VX2, VXBI, x4 + vfmul.d VX3, VXBI, x3 + vfmsub.d VX0, VXAR, x1, VX0 + vfmadd.d VX1, VXAR, x2, VX1 + vfmsub.d VX2, VXBR, x3, VX2 + vfmadd.d VX3, VXBR, x4, VX3 + vfadd.d x3, VX0, VX2 + vfadd.d x4, VX1, VX3 + addi.d I, I, -1 + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + blt $r0, I, .L121 + b .L997 + .align 3 +#else + vld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + + vld VX1, X, 4 * SIZE + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + vinsgr2vr.w x3, t1, 2 + vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s VX0, VXAI, x2 + vfmul.s VX1, VXAI, x1 + vfmul.s VX2, VXBI, x4 + vfmul.s VX3, VXBI, x3 + vfmsub.s VX0, VXAR, x1, VX0 + vfmadd.s VX1, VXAR, x2, VX1 + vfmsub.s VX2, VXBR, x3, VX2 + vfmadd.s VX3, VXBR, x4, VX3 + vfadd.s x3, VX0, VX2 + vfadd.s x4, VX1, VX3 + addi.d I, I, -1 + vstelm.w x3, YY, 0 * SIZE, 0 + vstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 1 + vstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 2 + vstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 3 + vstelm.w x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + blt $r0, I, .L121 + b 
.L997 + .align 3 +#endif + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L997 + .align 3 + +.L211: +#ifdef DOUBLE + vld VX2, Y, 0 * SIZE + vld VX3, Y, 2 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 + vfmul.d VX0, VXAI, x2 + vfmul.d VX1, VXAI, x1 + vfmul.d VX2, VXBI, x4 + vfmul.d VX3, VXBI, x3 + vfmsub.d VX0, VXAR, x1, VX0 + vfmadd.d VX1, VXAR, x2, VX1 + vfmsub.d VX2, VXBR, x3, VX2 + vfmadd.d VX3, VXBR, x4, VX3 + vfadd.d x3, VX0, VX2 + vfadd.d x4, VX1, VX3 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 2 * SIZE + + vld VX2, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 + vfmul.d VX0, VXAI, x2 + vfmul.d VX1, VXAI, x1 + vfmul.d VX2, VXBI, x4 + vfmul.d VX3, VXBI, x3 + vfmsub.d VX0, VXAR, x1, VX0 + vfmadd.d VX1, VXAR, x2, VX1 + vfmsub.d VX2, VXBR, x3, VX2 + vfmadd.d VX3, VXBR, x4, VX3 + vfadd.d x3, VX0, VX2 + vfadd.d x4, VX1, VX3 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + addi.d I, I, -1 + vst VX3, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L211 + b .L997 + .align 3 +#else + vld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + vld VX3, Y, 4 * SIZE + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, VX2 + vfmul.s VX0, VXAI, x2 + vfmul.s VX1, VXAI, x1 + vfmul.s VX2, VXBI, x4 + vfmul.s VX3, VXBI, x3 + vfmsub.s VX0, VXAR, x1, VX0 + vfmadd.s VX1, VXAR, x2, VX1 + vfmsub.s VX2, VXBR, x3, VX2 + vfmadd.s VX3, VXBR, x4, VX3 + vfadd.s x3, VX0, VX2 + vfadd.s x4, VX1, VX3 + vilvl.w VX2, x4 ,x3 + vilvh.w VX3, x4, x3 + addi.d I, I, -1 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L211 + b .L997 + .align 3 +#endif + +.L22: + bge $r0, I, .L997 + move YY, Y +#ifdef DOUBLE + fcmp.ceq.d $fcc0, BETAR, a1 + fcmp.ceq.d $fcc1, BETAI, a1 + fcmp.ceq.d $fcc2, ALPHAR, a1 + fcmp.ceq.d $fcc3, ALPHAI, a1 +#else + fcmp.ceq.s $fcc0, BETAR, a1 + fcmp.ceq.s $fcc1, BETAI, a1 + fcmp.ceq.s $fcc2, ALPHAR, a1 + fcmp.ceq.s $fcc3, ALPHAI, a1 +#endif + bceqz $fcc0, .L23 + bceqz $fcc1, .L23 + b .L24 + .align 3 + +.L23: + bceqz $fcc2, .L224 + bceqz $fcc3, .L224 //!(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0) + b .L223 //!(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) + .align 3 + +.L24: + bceqz $fcc2, .L222 + bceqz $fcc3, .L222 //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0) + b .L221 //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) + .align 3 + +.L221: //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) +#ifdef DOUBLE + vstelm.d VXZ, Y, 0, 0 + vstelm.d VXZ, Y, 0, 0 + add.d Y, Y, INCY + vstelm.d VXZ, Y, 0, 0 
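+ // .L221: both alpha and beta are zero, so Y is simply zero-filled -- one
+ // (real, imaginary) pair of stores per element, stepping Y by INCY.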
+ vstelm.d VXZ, Y, 0, 0 + add.d Y, Y, INCY + vstelm.d VXZ, Y, 0, 0 + vstelm.d VXZ, Y, 0, 0 + add.d Y, Y, INCY + vstelm.d VXZ, Y, 0, 0 + vstelm.d VXZ, Y, 0, 0 + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 + .align 3 +#else + vstelm.w VXZ, Y, 0, 0 + vstelm.w VXZ, Y, 0, 0 + add.d Y, Y, INCY + vstelm.w VXZ, Y, 0, 0 + vstelm.w VXZ, Y, 0, 0 + add.d Y, Y, INCY + vstelm.w VXZ, Y, 0, 0 + vstelm.w VXZ, Y, 0, 0 + add.d Y, Y, INCY + vstelm.w VXZ, Y, 0, 0 + vstelm.w VXZ, Y, 0, 0 + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 + .align 3 +#endif + +.L222: //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0) +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + vfmul.d x3, VXAI, x2 + vfmul.d x4, VXAI, x1 + vfmsub.d x3, VXAR, x1, x3 + vfmadd.d x4, VXAR, x2, x4 + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + vfmul.d x3, VXAI, x2 + vfmul.d x4, VXAI, x1 + vfmsub.d x3, VXAR, x1, x3 + vfmadd.d x4, VXAR, x2, x4 + addi.d I, I, -1 + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + blt $r0, I, .L222 + b .L997 + .align 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + vfmul.s x3, VXAI, x2 + vfmul.s x4, VXAI, x1 + vfmsub.s x3, VXAR, x1, x3 + vfmadd.s x4, VXAR, x2, x4 + addi.d I, I, -1 + vstelm.w x3, YY, 0 * SIZE, 0 + vstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 1 + vstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 2 + vstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 3 + vstelm.w x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + blt $r0, I, .L222 + b .L997 + .align 3 +#endif + +.L223: +#ifdef DOUBLE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d Y, Y, INCY + vfmul.d x3, VXBI, x2 + vfmul.d x4, VXBI, x1 + vfmsub.d x3, VXBR, x1, x3 + vfmadd.d x4, VXBR, x2, x4 + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d Y, Y, INCY + vfmul.d x3, VXBI, x2 + vfmul.d x4, VXBI, x1 + vfmsub.d x3, VXBR, x1, x3 + vfmadd.d x4, VXBR, x2, x4 + addi.d I, I, -1 
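+ // .L223: alpha is zero, so only Y is read and written: y := beta*y, with the
+ // elements gathered and scattered through the non-unit INCY stride.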
+ vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + blt $r0, I, .L223 + b .L997 + .align 3 +#else + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d Y, Y, INCY + vfmul.s x3, VXBI, x2 + vfmul.s x4, VXBI, x1 + vfmsub.s x3, VXBR, x1, x3 + vfmadd.s x4, VXBR, x2, x4 + + addi.d I, I, -1 + vstelm.w x3, YY, 0 * SIZE, 0 + vstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 1 + vstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 2 + vstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 3 + vstelm.w x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + blt $r0, I, .L223 + b .L997 + .align 3 +#endif + +.L224: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, VXAI, x2 + vfmul.d VX1, VXAI, x1 + vfmul.d VX2, VXBI, x4 + vfmul.d VX3, VXBI, x3 + vfmsub.d VX0, VXAR, x1, VX0 + vfmadd.d VX1, VXAR, x2, VX1 + vfmsub.d VX2, VXBR, x3, VX2 + vfmadd.d VX3, VXBR, x4, VX3 + vfadd.d x3, VX0, VX2 + vfadd.d x4, VX1, VX3 + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, VXAI, x2 + vfmul.d VX1, VXAI, x1 + vfmul.d VX2, VXBI, x4 + vfmul.d VX3, VXBI, x3 + vfmsub.d VX0, VXAR, x1, VX0 + vfmadd.d VX1, VXAR, x2, VX1 + vfmsub.d VX2, VXBR, x3, VX2 + vfmadd.d VX3, VXBR, x4, VX3 + vfadd.d x3, VX0, VX2 + vfadd.d x4, VX1, VX3 + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 + b .L997 + .align 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, 
Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + vinsgr2vr.w x3, t1, 2 + vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + vfmul.s VX0, VXAI, x2 + vfmul.s VX1, VXAI, x1 + vfmul.s VX2, VXBI, x4 + vfmul.s VX3, VXBI, x3 + vfmsub.s VX0, VXAR, x1, VX0 + vfmadd.s VX1, VXAR, x2, VX1 + vfmsub.s VX2, VXBR, x3, VX2 + vfmadd.s VX3, VXBR, x4, VX3 + vfadd.s x3, VX0, VX2 + vfadd.s x4, VX1, VX3 + addi.d I, I, -1 + + vstelm.w x3, YY, 0 * SIZE, 0 + vstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 1 + vstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 2 + vstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 3 + vstelm.w x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + blt $r0, I, .L224 + b .L997 + .align 3 +#endif + +.L997: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L998: +#ifdef DOUBLE + fld.d a1, X, 0 * SIZE + fld.d a2, X, 1 * SIZE + fld.d a3, Y, 0 * SIZE + fld.d a4, Y, 1 * SIZE + addi.d I, I, -1 + fmul.d s1, ALPHAI, a2 + fmul.d s2, ALPHAI, a1 + fmul.d s3, BETAI, a4 + fmul.d s4, BETAI, a3 + fmsub.d s1, ALPHAR, a1, s1 + fmadd.d s2, a2, ALPHAR, s2 + fmsub.d s3, BETAR, a3, s3 + fmadd.d s4, a4, BETAR, s4 + fadd.d s3, s3, s1 + fadd.d s4, s4, s2 + fst.d s3, Y, 0 * SIZE + fst.d s4, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 +#else + fld.s a1, X, 0 * SIZE + fld.s a2, X, 1 * SIZE + fld.s a3, Y, 0 * SIZE + fld.s a4, Y, 1 * SIZE + addi.d I, I, -1 + fmul.s s1, ALPHAI, a2 + fmul.s s2, ALPHAI, a1 + fmul.s s3, BETAI, a4 + fmul.s s4, BETAI, a3 + fmsub.s s1, ALPHAR, a1, s1 + fmadd.s s2, a2, ALPHAR, s2 + fmsub.s s3, BETAR, a3, s3 + fmadd.s s4, a4, BETAR, s4 + fadd.s s3, s3, s1 + fadd.s s4, s4, s2 + fst.s s3, Y, 0 * SIZE + fst.s s4, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 +#endif +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/caxpy_lasx.S b/kernel/loongarch64/caxpy_lasx.S new file mode 100644 index 000000000..2b970fe70 --- /dev/null +++ b/kernel/loongarch64/caxpy_lasx.S @@ -0,0 +1,707 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define XX $r5 +#define YY $r6 +#define ALPHAR $f0 +#define ALPHAI $f1 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define VX0 $xr8 +#define VX1 $xr20 +#define VX2 $xr21 +#define VX3 $xr22 +#define VXAR $xr23 +#define VXAI $xr19 +#define x1 $xr18 +#define x2 $xr17 +#define x3 $xr16 +#define x4 $xr15 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + CMPEQ $fcc0, ALPHAR, a1 + CMPEQ $fcc1, ALPHAI, a1 + bceqz $fcc0, .L10 + bcnez $fcc1, .L999 +.L10: + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + MTG t1, ALPHAR + MTG t2, ALPHAI +#ifdef DOUBLE + xvreplgr2vr.d VXAR, t1 + xvreplgr2vr.d VXAI, t2 + srai.d I, N, 2 +#else + xvreplgr2vr.w VXAR, t1 + xvreplgr2vr.w VXAI, t2 + srai.d I, N, 3 +#endif + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + .align 3 + +.L111: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvld VX3, Y, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 +#else + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX1, X, 8 * SIZE + xvld VX3, Y, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvpickev.w x3, VX3, VX2 + xvpickod.w x4, VX3, VX2 +#endif +#if !defined(CONJ) +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmsub.d VX1, VXAR, x1, VX0 + xvfmadd.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfadd.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmsub.s VX1, VXAR, x1, VX0 + xvfmadd.s VX3, x2, VXAR, VX2 + xvfadd.s x3, x3, VX1 + xvfadd.s x4, x4, VX3 +#endif +#else +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmadd.d VX1, VXAR, x1, VX0 + xvfmsub.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfsub.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmadd.s VX1, VXAR, x1, VX0 + xvfmsub.s VX3, x2, VXAR, VX2 + xvfadd.s x3, x3, VX1 + xvfsub.s x4, x4, VX3 +#endif +#endif +#ifdef DOUBLE + xvilvl.d VX2, x4 ,x3 + xvilvh.d VX3, x4, x3 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvilvl.w VX2, x4 ,x3 + xvilvh.w VX3, x4, x3 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 8 * SIZE + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L997 + move YY, Y + .align 3 
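+ // .L121: X is contiguous while Y is strided. Each iteration loads a full vector
+ // of X, gathers the matching Y elements into vector lanes, then forms y += alpha*x:
+ //   y_re += alpha_re*x_re - alpha_im*x_im
+ //   y_im += alpha_re*x_im + alpha_im*x_re
+ // (the CONJ build flips the two imaginary-part signs, i.e. x enters conjugated),
+ // and finally scatters the result back through INCY.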
+ +.L121: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 2 + xvinsgr2vr.d x4, t4, 2 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.d x3, t1, 1 + xvinsgr2vr.d x4, t2, 1 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 + add.d Y, Y, INCY + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 +#else + xvld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + xvld VX1, X, 8 * SIZE + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 + xvinsgr2vr.w x4, t4, 3 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 + add.d Y, Y, INCY + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 +#endif +#if !defined(CONJ) +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmsub.d VX1, VXAR, x1, VX0 + xvfmadd.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfadd.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmsub.s VX1, VXAR, x1, VX0 + xvfmadd.s VX3, x2, VXAR, VX2 + xvfadd.s x3, x3, VX1 + xvfadd.s x4, x4, VX3 +#endif +#else +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmadd.d VX1, VXAR, x1, VX0 + xvfmsub.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfsub.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmadd.s VX1, VXAR, x1, VX0 + xvfmsub.s VX3, x2, VXAR, VX2 + xvfadd.s x3, x3, VX1 + xvfsub.s x4, x4, VX3 +#endif +#endif +#ifdef DOUBLE + xvstelm.d x3, YY, 0 * SIZE, 0 + xvstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 2 + xvstelm.d x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 1 + xvstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 3 + xvstelm.d x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 +#else + addi.d I, I, -1 + xvstelm.w x3, YY, 0 * SIZE, 0 + xvstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 1 + xvstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 4 + xvstelm.w x4, YY, 1 * SIZE, 4 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 5 + xvstelm.w x4, YY, 1 * SIZE, 5 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 2 + xvstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 3 + xvstelm.w x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 6 + xvstelm.w x4, YY, 1 * SIZE, 6 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 7 + xvstelm.w x4, YY, 1 * SIZE, 7 + add.d YY, YY, INCY + addi.d X, X, 16 * SIZE +#endif + blt $r0, I, .L121 + b .L997 + .align 3 + +.L21:// INCX!=1 and INCY==1 
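+ // .L21/.L211: the mirror case -- X is gathered element-wise through INCX while
+ // Y, being contiguous, is loaded and stored as whole vectors.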
+ bge $r0, I, .L997 + .align 3 + +.L211: +#ifdef DOUBLE + xvld VX2, Y, 0 * SIZE + xvld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 2 + xvinsgr2vr.d x2, t4, 2 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 1 + xvinsgr2vr.d x2, t2, 1 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 +#else + xvld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + xvld VX3, Y, 8 * SIZE + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + xvpickev.w x3, VX3, VX2 + xvpickod.w x4, VX3, VX2 +#endif +#if !defined(CONJ) +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmsub.d VX1, VXAR, x1, VX0 + xvfmadd.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfadd.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmsub.s VX1, VXAR, x1, VX0 + xvfmadd.s VX3, x2, VXAR, VX2 + xvfadd.s x3, x3, VX1 + xvfadd.s x4, x4, VX3 +#endif +#else +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmadd.d VX1, VXAR, x1, VX0 + xvfmsub.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfsub.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmadd.s VX1, VXAR, x1, VX0 + xvfmsub.s VX3, x2, VXAR, VX2 + xvfadd.s x3, x3, VX1 + xvfsub.s x4, x4, VX3 +#endif +#endif +#ifdef DOUBLE + xvilvl.d VX2, x4 ,x3 + xvilvh.d VX3, x4, x3 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvilvl.w VX2, x4 ,x3 + xvilvh.w VX3, x4, x3 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 8 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + blt $r0, I, .L211 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move YY, Y + .align 3 + +.L222: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 1 + xvinsgr2vr.d x4, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY 
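+ // .L222: neither increment is unit, so both X and Y are gathered into vector
+ // lanes element by element before the same complex multiply-add is applied.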
+ ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 2 + xvinsgr2vr.d x4, t2, 2 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 + xvinsgr2vr.w x4, t4, 3 + + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 +#endif +#if !defined(CONJ) +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmsub.d VX1, VXAR, x1, VX0 + xvfmadd.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfadd.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmsub.s VX1, VXAR, x1, VX0 + xvfmadd.s VX3, x2, VXAR, VX2 + xvfadd.s x3, x3, VX1 + xvfadd.s x4, x4, VX3 +#endif +#else +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmadd.d VX1, VXAR, x1, VX0 + xvfmsub.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfsub.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmadd.s VX1, VXAR, x1, VX0 + xvfmsub.s VX3, x2, VXAR, VX2 + xvfadd.s x3, x3, VX1 + xvfsub.s x4, x4, VX3 +#endif +#endif + addi.d I, I, -1 +#ifdef DOUBLE + xvstelm.d x3, YY, 0 * SIZE, 0 + xvstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 1 + xvstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 2 + xvstelm.d x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 3 + xvstelm.d x4, YY, 1 * SIZE, 3 +#else + xvstelm.w x3, YY, 0 * SIZE, 0 + xvstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 1 + xvstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 2 + xvstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 3 + xvstelm.w x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 4 + xvstelm.w x4, YY, 1 * SIZE, 4 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 
* SIZE, 5 + xvstelm.w x4, YY, 1 * SIZE, 5 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 6 + xvstelm.w x4, YY, 1 * SIZE, 6 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 7 + xvstelm.w x4, YY, 1 * SIZE, 7 +#endif + add.d YY, YY, INCY + blt $r0, I, .L222 + .align 3 + +.L997: +#ifdef DOUBLE + andi I, N, 3 +#else + andi I, N, 7 +#endif + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + addi.d I, I, -1 +#if !defined(CONJ) + MUL s1, ALPHAI, a2 + MUL s2, ALPHAI, a1 + MSUB s3, ALPHAR, a1, s1 + MADD s4, a2, ALPHAR, s2 + ADD s3, s3, a3 + ADD s4, s4, a4 +#else + MUL s1, ALPHAI, a2 + MUL s2, ALPHAI, a1 + MADD s3, ALPHAR, a1, s1 + MSUB s4, a2, ALPHAR, s2 + ADD s3, s3, a3 + SUB s4, a4, s4 +#endif + ST s3, Y, 0 * SIZE + ST s4, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/caxpy_lsx.S b/kernel/loongarch64/caxpy_lsx.S new file mode 100644 index 000000000..85598d0b9 --- /dev/null +++ b/kernel/loongarch64/caxpy_lsx.S @@ -0,0 +1,679 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define XX $r5 +#define YY $r6 +#define ALPHAR $f0 +#define ALPHAI $f1 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define VX0 $vr8 +#define VX1 $vr20 +#define VX2 $vr21 +#define VX3 $vr22 +#define VXAR $vr23 +#define VXAI $vr19 +#define x1 $vr18 +#define x2 $vr17 +#define x3 $vr16 +#define x4 $vr15 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + CMPEQ $fcc0, ALPHAR, a1 + CMPEQ $fcc1, ALPHAI, a1 + bceqz $fcc0, .L10 + bcnez $fcc1, .L999 +.L10: + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + MTG t1, ALPHAR + MTG t2, ALPHAI +#ifdef DOUBLE + vreplgr2vr.d VXAR, t1 + vreplgr2vr.d VXAI, t2 +#else + vreplgr2vr.w VXAR, t1 + vreplgr2vr.w VXAI, t2 +#endif + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + .align 3 + +.L111: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 +#else + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, VX2 +#endif +#if !defined(CONJ) +#ifdef DOUBLE + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmsub.d VX1, VXAR, x1, VX0 + vfmadd.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfadd.d x4, x4, VX3 +#else + vfmul.s VX0, VXAI, x2 + vfmul.s VX2, VXAI, x1 + vfmsub.s VX1, VXAR, x1, VX0 + vfmadd.s VX3, x2, VXAR, VX2 + vfadd.s x3, x3, VX1 + vfadd.s x4, x4, VX3 +#endif +#else +#ifdef DOUBLE + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmadd.d VX1, VXAR, x1, VX0 + vfmsub.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfsub.d x4, x4, VX3 +#else + vfmul.s VX0, VXAI, x2 + vfmul.s VX2, VXAI, x1 + vfmadd.s VX1, VXAR, x1, VX0 + vfmsub.s VX3, x2, VXAR, VX2 + vfadd.s x3, x3, VX1 + vfsub.s x4, x4, VX3 +#endif +#endif +#ifdef DOUBLE + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX2, Y, 4 * SIZE + vld VX1, X, 6 * SIZE + vld VX3, Y, 6 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 +#if !defined(CONJ) + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmsub.d VX1, VXAR, x1, VX0 + vfmadd.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfadd.d x4, x4, VX3 +#else + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmadd.d VX1, VXAR, x1, VX0 + vfmsub.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfsub.d x4, x4, VX3 +#endif + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE +#else + vilvl.w VX2, x4 ,x3 + vilvh.w VX3, x4, x3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L997 + move YY, Y + .align 3 + +.L121: +#ifdef 
DOUBLE + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 +#else + vld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + vld VX1, X, 4 * SIZE + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + vinsgr2vr.w x3, t1, 2 + vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 +#endif +#if !defined(CONJ) +#ifdef DOUBLE + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmsub.d VX1, VXAR, x1, VX0 + vfmadd.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfadd.d x4, x4, VX3 +#else + vfmul.s VX0, VXAI, x2 + vfmul.s VX2, VXAI, x1 + vfmsub.s VX1, VXAR, x1, VX0 + vfmadd.s VX3, x2, VXAR, VX2 + vfadd.s x3, x3, VX1 + vfadd.s x4, x4, VX3 +#endif +#else +#ifdef DOUBLE + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmadd.d VX1, VXAR, x1, VX0 + vfmsub.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfsub.d x4, x4, VX3 +#else + vfmul.s VX0, VXAI, x2 + vfmul.s VX2, VXAI, x1 + vfmadd.s VX1, VXAR, x1, VX0 + vfmsub.s VX3, x2, VXAR, VX2 + vfadd.s x3, x3, VX1 + vfsub.s x4, x4, VX3 +#endif +#endif +#ifdef DOUBLE + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 +#if !defined(CONJ) + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmsub.d VX1, VXAR, x1, VX0 + vfmadd.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfadd.d x4, x4, VX3 +#else + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmadd.d VX1, VXAR, x1, VX0 + vfmsub.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfsub.d x4, x4, VX3 +#endif + addi.d I, I, -1 + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 +#else + addi.d I, I, -1 + vstelm.w x3, YY, 0 * SIZE, 0 + vstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 1 + vstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 2 + vstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 3 + vstelm.w x4, YY, 1 * SIZE, 3 +#endif + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + blt $r0, I, .L121 + b .L997 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L997 + .align 3 + +.L211: +#ifdef DOUBLE + vld VX2, Y, 0 * SIZE + vld VX3, Y, 2 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 +#else + vld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * 
SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + vld VX3, Y, 4 * SIZE + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, VX2 +#endif +#if !defined(CONJ) +#ifdef DOUBLE + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmsub.d VX1, VXAR, x1, VX0 + vfmadd.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfadd.d x4, x4, VX3 +#else + vfmul.s VX0, VXAI, x2 + vfmul.s VX2, VXAI, x1 + vfmsub.s VX1, VXAR, x1, VX0 + vfmadd.s VX3, x2, VXAR, VX2 + vfadd.s x3, x3, VX1 + vfadd.s x4, x4, VX3 +#endif +#else +#ifdef DOUBLE + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmadd.d VX1, VXAR, x1, VX0 + vfmsub.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfsub.d x4, x4, VX3 +#else + vfmul.s VX0, VXAI, x2 + vfmul.s VX2, VXAI, x1 + vfmadd.s VX1, VXAR, x1, VX0 + vfmsub.s VX3, x2, VXAR, VX2 + vfadd.s x3, x3, VX1 + vfsub.s x4, x4, VX3 +#endif +#endif +#ifdef DOUBLE + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 2 * SIZE + + vld VX2, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 +#if !defined(CONJ) + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmsub.d VX1, VXAR, x1, VX0 + vfmadd.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfadd.d x4, x4, VX3 +#else + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmadd.d VX1, VXAR, x1, VX0 + vfmsub.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfsub.d x4, x4, VX3 +#endif + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + addi.d I, I, -1 + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE +#else + vilvl.w VX2, x4 ,x3 + vilvh.w VX3, x4, x3 + addi.d I, I, -1 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L211 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move YY, Y + .align 3 + +.L222: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + vinsgr2vr.w x3, t1, 2 + 
vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 +#endif + add.d Y, Y, INCY +#if !defined(CONJ) +#ifdef DOUBLE + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmsub.d VX1, VXAR, x1, VX0 + vfmadd.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfadd.d x4, x4, VX3 +#else + vfmul.s VX0, VXAI, x2 + vfmul.s VX2, VXAI, x1 + vfmsub.s VX1, VXAR, x1, VX0 + vfmadd.s VX3, x2, VXAR, VX2 + vfadd.s x3, x3, VX1 + vfadd.s x4, x4, VX3 +#endif +#else +#ifdef DOUBLE + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmadd.d VX1, VXAR, x1, VX0 + vfmsub.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfsub.d x4, x4, VX3 +#else + vfmul.s VX0, VXAI, x2 + vfmul.s VX2, VXAI, x1 + vfmadd.s VX1, VXAR, x1, VX0 + vfmsub.s VX3, x2, VXAR, VX2 + vfadd.s x3, x3, VX1 + vfsub.s x4, x4, VX3 +#endif +#endif +#ifdef DOUBLE + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY +#if !defined(CONJ) + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmsub.d VX1, VXAR, x1, VX0 + vfmadd.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfadd.d x4, x4, VX3 +#else + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmadd.d VX1, VXAR, x1, VX0 + vfmsub.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfsub.d x4, x4, VX3 +#endif + addi.d I, I, -1 + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 +#else + addi.d I, I, -1 + vstelm.w x3, YY, 0 * SIZE, 0 + vstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 1 + vstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 2 + vstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 3 + vstelm.w x4, YY, 1 * SIZE, 3 +#endif + add.d YY, YY, INCY + blt $r0, I, .L222 + .align 3 + +.L997: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + addi.d I, I, -1 +#if !defined(CONJ) + MUL s1, ALPHAI, a2 + MUL s2, ALPHAI, a1 + MSUB s3, ALPHAR, a1, s1 + MADD s4, a2, ALPHAR, s2 + ADD s3, s3, a3 + ADD s4, s4, a4 +#else + MUL s1, ALPHAI, a2 + MUL s2, ALPHAI, a1 + MADD s3, ALPHAR, a1, s1 + MSUB s4, a2, ALPHAR, s2 + ADD s3, s3, a3 + SUB s4, a4, s4 +#endif + ST s3, Y, 0 * SIZE + ST s4, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/ccopy_lasx.S b/kernel/loongarch64/ccopy_lasx.S new file mode 100644 index 000000000..fbc5d96bc --- /dev/null +++ b/kernel/loongarch64/ccopy_lasx.S @@ -0,0 +1,386 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvld VX2, X, 8 * SIZE + xvld VX3, X, 12 * SIZE + xvst VX0, Y, 0 * SIZE + xvst VX1, Y, 4 * SIZE + xvst VX2, Y, 8 * SIZE + xvst VX3, Y, 12 * SIZE +#else + xvld VX1, X, 8 * SIZE + xvst VX0, Y, 0 * SIZE + xvst VX1, Y, 8 * SIZE +#endif + addi.d I, I, -1 + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +.L12: + bge $r0, I, .L122 + .align 3 + +.L121: + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvld VX2, X, 8 * SIZE + xvld VX3, X, 12 * SIZE + xvstelm.d VX0, Y, 0 * SIZE, 0 + xvstelm.d VX0, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + xvstelm.d VX0, Y, 0 * SIZE, 2 + xvstelm.d VX0, Y, 1 * SIZE, 3 + add.d Y, Y, INCY + xvstelm.d VX1, Y, 0 * SIZE, 0 + xvstelm.d VX1, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + xvstelm.d VX1, Y, 0 * SIZE, 2 + xvstelm.d VX1, Y, 1 * SIZE, 3 + add.d Y, Y, INCY + xvstelm.d VX2, Y, 0 * SIZE, 0 + xvstelm.d VX2, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + xvstelm.d VX2, Y, 0 * SIZE, 2 + xvstelm.d VX2, Y, 1 * SIZE, 3 + add.d Y, Y, INCY + xvstelm.d VX3, Y, 0 * SIZE, 0 + xvstelm.d VX3, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + xvstelm.d 
VX3, Y, 0 * SIZE, 2 + xvstelm.d VX3, Y, 1 * SIZE, 3 +#else + xvld VX1, X, 8 * SIZE + xvstelm.w VX0, Y, 0 * SIZE, 0 + xvstelm.w VX0, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0 * SIZE, 2 + xvstelm.w VX0, Y, 1 * SIZE, 3 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0 * SIZE, 4 + xvstelm.w VX0, Y, 1 * SIZE, 5 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0 * SIZE, 6 + xvstelm.w VX0, Y, 1 * SIZE, 7 + add.d Y, Y, INCY + xvstelm.w VX1, Y, 0 * SIZE, 0 + xvstelm.w VX1, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + xvstelm.w VX1, Y, 0 * SIZE, 2 + xvstelm.w VX1, Y, 1 * SIZE, 3 + add.d Y, Y, INCY + xvstelm.w VX1, Y, 0 * SIZE, 4 + xvstelm.w VX1, Y, 1 * SIZE, 5 + add.d Y, Y, INCY + xvstelm.w VX1, Y, 0 * SIZE, 6 + xvstelm.w VX1, Y, 1 * SIZE, 7 +#endif + add.d Y, Y, INCY + addi.d X, X, 16 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + xvst VX0, Y, 0 * SIZE + xvst VX1, Y, 4 * SIZE + xvst VX2, Y, 8 * SIZE + xvst VX3, Y, 12 * SIZE +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX1, t1, 0 + xvinsgr2vr.w VX1, t2, 1 + xvinsgr2vr.w VX1, t3, 2 + xvinsgr2vr.w VX1, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX1, t1, 4 + xvinsgr2vr.w VX1, t2, 5 + xvinsgr2vr.w VX1, t3, 6 + xvinsgr2vr.w VX1, t4, 7 + xvst VX0, Y, 0 * SIZE + xvst VX1, Y, 8 * SIZE +#endif + addi.d I, I, -1 + addi.d Y, Y, 16 * SIZE + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d X, X, INCX + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + .align 3 + +.L222: 
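+ // .L222: neither stride is unit, so the copy falls back to scalar loads and stores
+ // of (real, imaginary) pairs, eight complex elements per iteration.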
+ LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/ccopy_lsx.S b/kernel/loongarch64/ccopy_lsx.S new file mode 100644 index 000000000..4c4d880f1 --- /dev/null +++ b/kernel/loongarch64/ccopy_lsx.S @@ -0,0 +1,411 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
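For orientation: ccopy_lasx.S above and the ccopy_lsx.S file introduced here implement the same operation, a plain complex vector copy y := x, differing only in vector width (256-bit LASX vs 128-bit LSX). A minimal C sketch of the intended semantics, with illustrative names rather than the exact OpenBLAS kernel signature:

    /* Hedged reference for the ccopy kernels: copy n complex elements from x
     * to y (single precision shown; the #ifdef DOUBLE paths are analogous).
     * inc_x / inc_y are strides in complex elements; the assembly converts
     * them to byte strides with ZBASE_SHIFT before entering the loops.      */
    static void ccopy_ref(long n, const float *x, long inc_x, float *y, long inc_y)
    {
        for (long i = 0; i < n; i++) {
            y[0] = x[0];                  /* real part      */
            y[1] = x[1];                  /* imaginary part */
            x += 2 * inc_x;
            y += 2 * inc_y;
        }
    }

Both assembly files split this into a vectorized main loop over eight complex elements per iteration (srai.d I, N, 3) plus a scalar tail (andi I, N, 7), with separate paths for the four unit/non-unit combinations of INCX and INCY (.L11, .L12, .L21, .L22).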
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11:// INCX==1 and INCY==1 + bge $r0, I, .L112 + .align 3 + +.L111: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX2, X, 4 * SIZE + vld VX3, X, 6 * SIZE + vst VX0, Y, 0 * SIZE + vst VX1, Y, 2 * SIZE + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE + vld VX0, X, 8 * SIZE + vld VX1, X, 10 * SIZE + vld VX2, X, 12 * SIZE + vld VX3, X, 14 * SIZE + addi.d I, I, -1 + vst VX0, Y, 8 * SIZE + vst VX1, Y, 10 * SIZE + vst VX2, Y, 12 * SIZE + vst VX3, Y, 14 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX2, X, 8 * SIZE + vld VX3, X, 12 * SIZE + addi.d I, I, -1 + vst VX0, Y, 0 * SIZE + vst VX1, Y, 4 * SIZE + vst VX2, Y, 8 * SIZE + vst VX3, Y, 12 * SIZE +#endif + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L122 + .align 3 + +.L121: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX2, X, 4 * SIZE + vld VX3, X, 6 * SIZE + vstelm.d VX0, Y, 0 * SIZE, 0 + vstelm.d VX0, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0 * SIZE, 0 + vstelm.d VX1, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.d VX2, Y, 0 * SIZE, 0 + vstelm.d VX2, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.d VX3, Y, 0 * SIZE, 0 + vstelm.d VX3, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vld VX0, X, 8 * SIZE + vld VX1, X, 10 * SIZE + vld VX2, X, 12 * SIZE + vld VX3, X, 14 * SIZE + vstelm.d VX0, Y, 0 * SIZE, 0 + vstelm.d VX0, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0 * SIZE, 0 + vstelm.d VX1, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.d VX2, Y, 0 * SIZE, 0 + vstelm.d VX2, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.d VX3, Y, 0 * SIZE, 0 + vstelm.d VX3, Y, 1 * SIZE, 1 +#else + vld VX1, X, 4 * SIZE + vld VX2, X, 8 * SIZE + vld VX3, X, 12 * SIZE + vstelm.w VX0, Y, 0 * SIZE, 0 + vstelm.w VX0, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.w VX0, Y, 0 * SIZE, 2 + vstelm.w VX0, Y, 1 * SIZE, 3 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0 * SIZE, 0 + vstelm.w VX1, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0 * SIZE, 2 + vstelm.w VX1, Y, 1 * SIZE, 3 + add.d Y, Y, INCY + vstelm.w VX2, Y, 0 * SIZE, 0 + vstelm.w VX2, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.w VX2, Y, 0 * SIZE, 2 + vstelm.w VX2, Y, 1 * SIZE, 3 + add.d Y, Y, INCY + vstelm.w VX3, Y, 0 * SIZE, 0 + vstelm.w VX3, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.w VX3, Y, 0 * SIZE, 2 + vstelm.w VX3, Y, 1 * SIZE, 3 +#endif + add.d Y, Y, INCY + addi.d X, X, 16 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + 
bge $r0, I, .L999 + .align 3 + +.L123: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX0, Y, 0 * SIZE + vst VX1, Y, 2 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX0, Y, 4 * SIZE + vst VX1, Y, 6 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX0, Y, 8 * SIZE + vst VX1, Y, 10 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX0, Y, 12 * SIZE + vst VX1, Y, 14 * SIZE +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + vst VX0, Y, 0 * SIZE + vst VX1, Y, 4 * SIZE + vst VX2, Y, 8 * SIZE + vst VX3, Y, 12 * SIZE +#endif + addi.d Y, Y, 16 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d X, X, INCX + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + .align 3 + +.L222: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * 
SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/cdot_lasx.S b/kernel/loongarch64/cdot_lasx.S new file mode 100644 index 000000000..0583e56ea --- /dev/null +++ b/kernel/loongarch64/cdot_lasx.S @@ -0,0 +1,565 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r19 +#define TEMP $r10 +#define t1 $r11 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define res1 $xr16 +#define res2 $xr17 +#define res3 $xr18 +#define res4 $xr19 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 +#define x1 $xr20 +#define x2 $xr21 +#define x3 $xr22 +#define x4 $xr23 + + PROLOGUE + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + xvxor.v res3, res3, res3 + xvxor.v res4, res4, res4 + bge $r0, N, .L999 + li.d TEMP, 2 * SIZE + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT +#ifdef DOUBLE + srai.d I, N, 2 +#else + srai.d I, N, 3 +#endif + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + .align 3 + +.L111: + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX3, Y, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 + xvfmadd.d res1, x1, x3, res1 + xvfmadd.d res2, x2, x3, res2 + xvfmadd.d res3, x1, x4, res3 + xvfmadd.d res4, x2, x4, res4 + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvld VX1, X, 8 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX3, Y, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvpickev.w x3, VX3, VX2 + xvpickod.w x4, VX3, VX2 + xvfmadd.s res1, x1, x3, res1 + xvfmadd.s res2, x2, x3, res2 + xvfmadd.s res3, x1, x4, res3 + xvfmadd.s res4, x2, x4, res4 + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L111 + b .L996 + .align 3 + +.L12: + bge $r0, I, .L997 + .align 3 + +.L121: + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 2 + xvinsgr2vr.d x4, t4, 2 + xvld VX1, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 1 + xvinsgr2vr.d x4, t2, 1 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 + addi.d X, X, 8 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvfmadd.d res1, x1, x3, res1 + xvfmadd.d res2, x2, x3, res2 + xvfmadd.d res3, x1, x4, res3 + xvfmadd.d res4, x2, x4, res4 +#else + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + xvld VX1, X, 8 * SIZE + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 + xvinsgr2vr.w x4, t4, 3 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + 
add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 + addi.d X, X, 16 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvfmadd.s res1, x1, x3, res1 + xvfmadd.s res2, x2, x3, res2 + xvfmadd.s res3, x1, x4, res3 + xvfmadd.s res4, x2, x4, res4 +#endif + addi.d I, I, -1 + blt $r0, I, .L121 + b .L996 + .align 3 + +.L21: + bge $r0, I, .L997 + .align 3 + +.L211: + xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 2 + xvinsgr2vr.d x2, t4, 2 + xvld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 1 + xvinsgr2vr.d x2, t2, 1 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + addi.d Y, Y, 8 * SIZE + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 + xvfmadd.d res1, x1, x3, res1 + xvfmadd.d res2, x2, x3, res2 + xvfmadd.d res3, x1, x4, res3 + xvfmadd.d res4, x2, x4, res4 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + xvld VX3, Y, 8 * SIZE + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + addi.d Y, Y, 8 * SIZE + xvpickev.w x3, VX3, VX2 + xvpickod.w x4, VX3, VX2 + xvfmadd.s res1, x1, x3, res1 + xvfmadd.s res2, x2, x3, res2 + xvfmadd.s res3, x1, x4, res3 + xvfmadd.s res4, x2, x4, res4 +#endif + addi.d I, I, -1 + blt $r0, I, .L211 + b .L996 + .align 3 + +.L22: + bge $r0, I, .L997 + .align 3 + +.L222: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 1 + xvinsgr2vr.d x4, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 2 + xvinsgr2vr.d x4, t2, 2 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 + xvfmadd.d res1, x1, x3, res1 + xvfmadd.d res2, x2, x3, res2 + xvfmadd.d res3, x1, x4, res3 + xvfmadd.d res4, x2, x4, res4 
+#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 + xvinsgr2vr.w x4, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 + xvfmadd.s res1, x1, x3, res1 + xvfmadd.s res2, x2, x3, res2 + xvfmadd.s res3, x1, x4, res3 + xvfmadd.s res4, x2, x4, res4 +#endif + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L996: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 + xvpickve.d VX1, res2, 1 + xvpickve.d VX2, res2, 2 + xvpickve.d VX3, res2, 3 + xvfadd.d res2, VX1, res2 + xvfadd.d res2, VX2, res2 + xvfadd.d res2, VX3, res2 + xvpickve.d VX1, res3, 1 + xvpickve.d VX2, res3, 2 + xvpickve.d VX3, res3, 3 + xvfadd.d res3, VX1, res3 + xvfadd.d res3, VX2, res3 + xvfadd.d res3, VX3, res3 + xvpickve.d VX1, res4, 1 + xvpickve.d VX2, res4, 2 + xvpickve.d VX3, res4, 3 + xvfadd.d res4, VX1, res4 + xvfadd.d res4, VX2, res4 + xvfadd.d res4, VX3, res4 +#else + xvpickve.w VX0, res1, 1 + xvpickve.w VX1, res1, 2 + xvpickve.w VX2, res1, 3 + xvpickve.w VX3, res1, 4 + xvpickve.w x1, res1, 5 + xvpickve.w x2, res1, 6 + xvpickve.w x3, res1, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvfadd.s res1, x1, res1 + xvfadd.s res1, x2, res1 + xvfadd.s res1, x3, res1 + xvpickve.w VX0, res2, 1 + xvpickve.w VX1, res2, 2 + xvpickve.w VX2, res2, 3 + xvpickve.w VX3, res2, 4 + xvpickve.w x1, res2, 5 + xvpickve.w x2, res2, 6 + xvpickve.w x3, res2, 7 + xvfadd.s res2, VX0, res2 + xvfadd.s res2, VX1, res2 + xvfadd.s res2, VX2, res2 + xvfadd.s res2, VX3, res2 + xvfadd.s res2, x1, res2 + xvfadd.s res2, x2, res2 + xvfadd.s res2, x3, res2 + xvpickve.w VX0, res3, 1 + xvpickve.w VX1, res3, 2 + xvpickve.w VX2, res3, 3 + xvpickve.w VX3, res3, 4 + xvpickve.w x1, res3, 5 
+ xvpickve.w x2, res3, 6 + xvpickve.w x3, res3, 7 + xvfadd.s res3, VX0, res3 + xvfadd.s res3, VX1, res3 + xvfadd.s res3, VX2, res3 + xvfadd.s res3, VX3, res3 + xvfadd.s res3, x1, res3 + xvfadd.s res3, x2, res3 + xvfadd.s res3, x3, res3 + xvpickve.w VX0, res4, 1 + xvpickve.w VX1, res4, 2 + xvpickve.w VX2, res4, 3 + xvpickve.w VX3, res4, 4 + xvpickve.w x1, res4, 5 + xvpickve.w x2, res4, 6 + xvpickve.w x3, res4, 7 + xvfadd.s res4, VX0, res4 + xvfadd.s res4, VX1, res4 + xvfadd.s res4, VX2, res4 + xvfadd.s res4, VX3, res4 + xvfadd.s res4, x1, res4 + xvfadd.s res4, x2, res4 + xvfadd.s res4, x3, res4 +#endif + .align 3 + +.L997: +#ifdef DOUBLE + andi I, N, 3 +#else + andi I, N, 7 +#endif + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + MADD s1, a1, a3, s1 + MADD s2, a2, a3, s2 + MADD s3, a1, a4, s3 + MADD s4, a2, a4, s4 + addi.d I, I, -1 + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: +#ifndef CONJ + SUB $f0, s1, s4 + ADD $f1, s3, s2 +#else + ADD $f0, s1, s4 + SUB $f1, s3, s2 +#endif + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/cdot_lsx.S b/kernel/loongarch64/cdot_lsx.S new file mode 100644 index 000000000..5feea12be --- /dev/null +++ b/kernel/loongarch64/cdot_lsx.S @@ -0,0 +1,397 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
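The .L996 reduction and .L999 epilogue above make the accumulation scheme explicit: res1..res4 (and their scalar counterparts s1..s4 in the .L998 tail) hold the four partial sums sum(x_re*y_re), sum(x_im*y_re), sum(x_re*y_im) and sum(x_im*y_im), which are combined differently for the unconjugated and conjugated dot products. A hedged scalar C sketch of the same logic, with illustrative names only:

    /* Scalar model of the cdot kernels: accumulate four partial sums as in the
     * .L998 tail loop, then combine them exactly as .L999 does.              */
    typedef struct { float re, im; } complex_f;

    static complex_f cdot_ref(long n, const float *x, long inc_x,
                              const float *y, long inc_y, int conjugate_x)
    {
        float s1 = 0.0f, s2 = 0.0f, s3 = 0.0f, s4 = 0.0f;
        for (long i = 0; i < n; i++) {
            s1 += x[0] * y[0];            /* re * re */
            s2 += x[1] * y[0];            /* im * re */
            s3 += x[0] * y[1];            /* re * im */
            s4 += x[1] * y[1];            /* im * im */
            x += 2 * inc_x;
            y += 2 * inc_y;
        }
        complex_f r;
        if (!conjugate_x) {               /* cdotu: sum of x[i] * y[i]       */
            r.re = s1 - s4;
            r.im = s3 + s2;
        } else {                          /* cdotc: sum of conj(x[i]) * y[i] */
            r.re = s1 + s4;
            r.im = s3 - s2;
        }
        return r;
    }

In the vector paths, res1..res4 carry these sums per lane; .L996 folds the lanes into lane 0 with xvpickve/xvfadd before the same CONJ-dependent combination is applied.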
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r19 +#define TEMP $r10 +#define t1 $r11 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define res1 $vr16 +#define res2 $vr17 +#define res3 $vr18 +#define res4 $vr19 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 +#define x1 $vr20 +#define x2 $vr21 +#define x3 $vr22 +#define x4 $vr23 + + PROLOGUE + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + vxor.v res3, res3, res3 + vxor.v res4, res4, res4 + bge $r0, N, .L999 + li.d TEMP, 2 * SIZE + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT +#ifdef DOUBLE + srai.d I, N, 1 +#else + srai.d I, N, 2 +#endif + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + .align 3 + +.L111: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX2, Y, 0 * SIZE + vld VX3, Y, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 + vfmadd.d res1, x1, x3, res1 + vfmadd.d res2, x2, x3, res2 + vfmadd.d res3, x1, x4, res3 + vfmadd.d res4, x2, x4, res4 + addi.d X, X, 4 * SIZE + addi.d Y, Y, 4 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX2, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, VX2 + vfmadd.s res1, x1, x3, res1 + vfmadd.s res2, x2, x3, res2 + vfmadd.s res3, x1, x4, res3 + vfmadd.s res4, x2, x4, res4 + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L111 + b .L996 + .align 3 + +.L12: + bge $r0, I, .L997 + .align 3 + +.L121: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + addi.d X, X, 4 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmadd.d res1, x1, x3, res1 + vfmadd.d res2, x2, x3, res2 + vfmadd.d res3, x1, x4, res3 + vfmadd.d res4, x2, x4, res4 +#else + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + vld VX1, X, 4 * SIZE + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 2 + vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 + addi.d X, X, 8 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmadd.s res1, x1, x3, res1 + vfmadd.s res2, x2, x3, res2 + vfmadd.s res3, x1, x4, res3 + vfmadd.s res4, x2, x4, res4 +#endif + addi.d I, I, -1 + blt $r0, I, .L121 + b .L996 + .align 3 + +.L21: + bge $r0, I, .L997 + .align 3 + +.L211: + vld VX2, Y, 0 * SIZE +#ifdef DOUBLE + vld VX3, Y, 2 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + 
vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + addi.d Y, Y, 4 * SIZE + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 + vfmadd.d res1, x1, x3, res1 + vfmadd.d res2, x2, x3, res2 + vfmadd.d res3, x1, x4, res3 + vfmadd.d res4, x2, x4, res4 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + vld VX3, Y, 4 * SIZE + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + addi.d Y, Y, 8 * SIZE + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, VX2 + vfmadd.s res1, x1, x3, res1 + vfmadd.s res2, x2, x3, res2 + vfmadd.s res3, x1, x4, res3 + vfmadd.s res4, x2, x4, res4 +#endif + addi.d I, I, -1 + blt $r0, I, .L211 + b .L996 + .align 3 + +.L22: + bge $r0, I, .L997 + .align 3 + +.L222: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + vfmadd.d res1, x1, x3, res1 + vfmadd.d res2, x2, x3, res2 + vfmadd.d res3, x1, x4, res3 + vfmadd.d res4, x2, x4, res4 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 2 + vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 + vfmadd.s res1, x1, x3, res1 + vfmadd.s res2, x2, x3, res2 + vfmadd.s res3, x1, x4, res3 + vfmadd.s res4, x2, x4, res4 +#endif + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L996: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 + vreplvei.d VX1, res2, 1 + vfadd.d res2, VX1, res2 + vreplvei.d VX1, res3, 1 + vfadd.d res3, VX1, res3 + vreplvei.d VX1, res4, 1 + vfadd.d res4, VX1, res4 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 + vreplvei.w VX1, res2, 1 + vreplvei.w VX2, res2, 2 + vreplvei.w VX3, res2, 3 + vfadd.s res2, VX1, res2 + vfadd.s res2, VX2, res2 + vfadd.s res2, VX3, res2 + vreplvei.w VX1, res3, 1 + vreplvei.w VX2, res3, 2 + vreplvei.w VX3, res3, 3 + vfadd.s res3, VX1, res3 + vfadd.s res3, VX2, res3 + vfadd.s res3, VX3, res3 + vreplvei.w VX1, res4, 1 + vreplvei.w VX2, res4, 2 + vreplvei.w VX3, res4, 3 + vfadd.s res4, VX1, res4 + vfadd.s res4, VX2, res4 + 
vfadd.s res4, VX3, res4 +#endif + .align 3 + +.L997: +#ifdef DOUBLE + andi I, N, 1 +#else + andi I, N, 3 +#endif + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + MADD s1, a1, a3, s1 + MADD s2, a2, a3, s2 + MADD s3, a1, a4, s3 + MADD s4, a2, a4, s4 + addi.d I, I, -1 + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: +#ifndef CONJ + SUB $f0, s1, s4 + ADD $f1, s3, s2 +#else + ADD $f0, s1, s4 + SUB $f1, s3, s2 +#endif + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/cgemm_kernel_2x2_lasx.S b/kernel/loongarch64/cgemm_kernel_2x2_lasx.S new file mode 100644 index 000000000..e07f7dc64 --- /dev/null +++ b/kernel/loongarch64/cgemm_kernel_2x2_lasx.S @@ -0,0 +1,857 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
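The LSX file mirrors the LASX one with 128-bit registers, so each main-loop iteration covers half as many complex elements; beyond that, only the shift/mask constants for the main-loop count and the scalar tail differ. A small helper, purely illustrative, that reproduces the srai.d/andi pairs used by the four cdot variants:

    /* Main-loop/tail split of the cdot kernels, counted in complex elements.
     * The shift values mirror the srai.d/andi pairs in the two files above. */
    static void cdot_split(long n, int is_double, int is_lasx,
                           long *main_iters, long *tail)
    {
        int shift = is_lasx ? (is_double ? 2 : 3)   /* LASX: 4 or 8 per iteration */
                            : (is_double ? 1 : 2);  /* LSX : 2 or 4 per iteration */
        *main_iters = n >> shift;
        *tail       = n & ((1L << shift) - 1);
    }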
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + + +/* Function parameters */ +#define M $r4 // param 1: bm +#define N $r5 // param 2: bn +#define K $r6 // param 3: bk +#define ALPHA_R $f0 // param 4: alphar +#define ALPHA_I $f1 // param 5: alphai +#define A $r7 // param 6: ba +#define B $r8 // param 7: bb +#define C $r9 // param 8: bc +#define LDC $r10 // param 9: ldc + +#if defined (TRMMKERNEL) +#define OFFSET $r11 // param 10: offset +#endif +#define OFF $r26 + +#define I $r12 +#define J $r13 +#define L $r14 +#define TL $r15 +#define A0 $r16 +#define B0 $r17 +#define C0 $r18 +#define C1 $r19 +#define C2 $r20 +#define C3 $r23 +#define T0 $r24 +#define T1 $r25 + +#define a1 $f2 +#define a2 $f3 +#define a3 $f4 +#define a4 $f5 +#define a5 $f6 +#define a6 $f7 +#define a7 $f8 +#define a8 $f9 +#define b1 $f10 +#define b2 $f11 +#define b3 $f12 +#define b4 $f13 +#define b5 $f14 +#define b6 $f15 +#define b7 $f16 +#define b8 $f17 +#define c11 $f18 +#define c12 $f19 +#define c21 $f20 +#define c22 $f21 +#define c31 $f22 +#define c32 $f23 +#define c41 $f24 +#define c42 $f25 + +/* LASX vectors */ +#define U0 $xr30 +#define U1 $xr31 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define U8 $xr8 +#define U9 $xr9 +#define U10 $xr10 +#define U11 $xr11 +#define U12 $xr12 +#define U13 $xr13 +#define U14 $xr14 +#define U15 $xr15 +#define D0 $xr16 +#define D1 $xr17 +#define D2 $xr18 +#define D3 $xr19 +#define D4 $xr20 +#define D5 $xr21 +#define D6 $xr22 +#define D7 $xr23 +#define D8 $xr24 +#define D9 $xr25 +#define D10 $xr26 +#define D11 $xr27 +#define VALPHAR $xr28 +#define VALPHAI $xr29 + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define XVMADD1 XVFMADD +#define XVMADD2 XVFMADD +#define XVMADD3 XVNMSUB +#define XVMADD4 XVFMADD + +#define VMADD1 VFMADD +#define VMADD2 VFMADD +#define VMADD3 VNMSUB +#define VMADD4 VFMADD + +#define XVFADD1 XVFADD +#define XVFADD2 XVFADD +#define XVFADD3 XVFSUB +#define XVFADD4 XVFADD + +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define XVMADD1 XVFMADD +#define XVMADD2 XVFMADD +#define XVMADD3 XVFMADD +#define XVMADD4 XVNMSUB + +#define VMADD1 VFMADD +#define VMADD2 VFMADD +#define VMADD3 VFMADD +#define VMADD4 VNMSUB + +#define XVFADD1 XVFADD +#define XVFADD2 XVFADD +#define XVFADD3 XVFADD +#define XVFADD4 XVFSUB + +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define XVMADD1 XVFMADD +#define XVMADD2 XVNMSUB +#define XVMADD3 XVFMADD +#define XVMADD4 XVFMADD + +#define VMADD1 VFMADD +#define VMADD2 VNMSUB +#define VMADD3 VFMADD +#define VMADD4 VFMADD + +#define XVFADD1 XVFADD +#define XVFADD2 XVFSUB +#define XVFADD3 XVFADD +#define XVFADD4 XVFADD + +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define XVMADD1 XVFMADD +#define XVMADD2 XVNMSUB +#define XVMADD3 XVNMSUB +#define XVMADD4 XVNMSUB + +#define VMADD1 VFMADD +#define VMADD2 VNMSUB +#define VMADD3 VNMSUB +#define VMADD4 VNMSUB + +#define XVFADD1 XVFADD +#define XVFADD2 XVFSUB +#define XVFADD3 XVFSUB +#define XVFADD4 XVFSUB + +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 NMSUB +#define MADD4 NMSUB +#endif + + PROLOGUE + + addi.d $sp, $sp, 
-128 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + ST $f23, $sp, 40 + ST $f24, $sp, 48 + ST $f25, $sp, 56 + ST $f26, $sp, 64 + ST $f27, $sp, 72 + ST $f28, $sp, 80 + ST $f29, $sp, 88 + ST $f30, $sp, 96 + ST $f31, $sp, 104 + ST ALPHA_R,$sp, 112 + ST ALPHA_I,$sp, 120 + + xvldrepl.w VALPHAR, $sp, 112 + xvldrepl.w VALPHAI, $sp, 120 + +#if defined (TRMMKERNEL) && !defined(LEFT) + sub.d OFF, $r0, OFFSET +#else + xor OFF, OFF, OFF +#endif + + slli.d LDC, LDC, 2 + + move J, $r0 + srai.d T0, N, 1 + beq J, T0, .L19 + +.L10: /* for(j=0; j 0) I-- */ + move S1, TS //a_offset1 + add.d S2, TS, TL //a_offset2 + srai.d J, M, 0x02 + add.d TS, TS, T0 + + beq J, ZERO, .L_I3 + +.L_I1: /* if (j > 0) J-- */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x00 + xvld U2, S2, 0x00 + + xvpermi.q U0, U2, 0x02 + xvpermi.q U2, U1, 0x31 + + xvpermi.d U0, U0, 0xd8 + xvpermi.d U2, U2, 0xd8 + + xvst U0, TD, 0x00 + xvst U2, TD, 0x20 + + addi.d S1, S1, 0x20 // a_offset1 + addi.d S2, S2, 0x20 + addi.d TD, TD, 0x40 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_I1 + +.L_I3: + andi J, M, 0x03 + beq J, ZERO, .L_II20 + +.L_II1: /* j = (m & 3) if (j > 0) */ + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0c + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d TD, TD, 0x10 + + addi.d J, J, -1 + blt ZERO, J, .L_II1 + +.L_II20: + addi.d I, I, -1 + blt ZERO, I, .L_J1 + +.L_N0: /* if(n&1)*/ + andi I, N, 0x01 + beq ZERO, I, .L_N00 + +.L_N1: + srai.d J, M, 0x02 + beq ZERO, J, .L_N10 + +.L_N11: /* j = (m >> 2) if (j > 0) */ + xvld U0, TS, 0x00 + + xvst U0, TD, 0x00 + + addi.d TS, TS, 0x20 // a_offset + addi.d TD, TD, 0x20 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_N11 + +.L_N10: + andi J, M, 0x03 + beq J, ZERO, .L_N00 + +.L_N12: /* j = (m & 3) if (j > 0) */ + fld.s F0, TS, 0x00 + fld.s F1, TS, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + + addi.d TS, TS, 0x08 // a_offset + addi.d TD, TD, 0x08 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_N12 + +.L_N00: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/cgemm_ncopy_2_lsx.S b/kernel/loongarch64/cgemm_ncopy_2_lsx.S new file mode 100644 index 000000000..1cf4d87dc --- /dev/null +++ b/kernel/loongarch64/cgemm_ncopy_2_lsx.S @@ -0,0 +1,202 @@ +/******************************************************************************* +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define TD $r20 +#define TS $r11 +#define TL $r7 +#define T0 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define D0 $vr8 +#define D1 $vr9 +#define D2 $vr10 +#define D3 $vr11 +#define D4 $vr12 +#define D5 $vr13 +#define D6 $vr14 +#define D7 $vr15 +#define D8 $vr16 + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TD, DST //boffset + move TS, SRC //aoffset + + slli.d TL, LDA, 0x02 //lda + slli.d TL, TL, 0x01 + slli.d T0, TL, 0x01 + + srai.d I, N, 0x01 + beq I, ZERO, .L_N0 + +.L_J1: /* if (i > 0) I-- */ + move S1, TS //a_offset1 + add.d S2, TS, TL //a_offset2 + srai.d J, M, 0x02 + add.d TS, TS, T0 + + beq J, ZERO, .L_I3 + +.L_I1: /* if (j > 0) J-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vand.v D0, U2, U2 + vand.v D1, U3, U3 + vand.v D2, U2, U2 + vand.v D3, U3, U3 + + vpermi.w D0, U0, 0x44 + vpermi.w D2, U0, 0xee + vpermi.w D1, U1, 0x44 + vpermi.w D3, U1, 0xee + + vst D0, TD, 0x00 + vst D2, TD, 0x10 + vst D1, TD, 0x20 + vst D3, TD, 0x30 + + addi.d S1, S1, 0x20 // a_offset1 + addi.d S2, S2, 0x20 + addi.d TD, TD, 0x40 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_I1 + +.L_I3: + andi J, M, 0x03 + beq J, ZERO, .L_II20 + +.L_II1: /* j = (m & 3) if (j > 0) */ + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0c + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d TD, TD, 0x10 + + addi.d J, J, -1 + blt ZERO, J, .L_II1 + +.L_II20: + addi.d I, I, -1 + blt ZERO, I, .L_J1 + +.L_N0: /* if(n&1)*/ + andi I, N, 0x01 + beq ZERO, I, .L_N00 + +.L_N1: + srai.d J, M, 0x02 + beq ZERO, J, .L_N10 + +.L_N11: /* j = (m >> 2) if (j > 0) */ + vld U0, TS, 0x00 + vld U1, TS, 0x10 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + + addi.d TS, TS, 0x20 // a_offset + addi.d TD, TD, 0x20 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_N11 + +.L_N10: + andi J, M, 0x03 + beq J, ZERO, .L_N00 + +.L_N12: /* j = (m & 3) if (j > 0) */ + fld.s F0, TS, 0x00 + fld.s F1, TS, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + + addi.d TS, TS, 0x08 // a_offset + addi.d TD, TD, 0x08 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_N12 + +.L_N00: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE 
\ No newline at end of file diff --git a/kernel/loongarch64/cgemm_ncopy_4_lsx.S b/kernel/loongarch64/cgemm_ncopy_4_lsx.S new file mode 100644 index 000000000..bfc712fb2 --- /dev/null +++ b/kernel/loongarch64/cgemm_ncopy_4_lsx.S @@ -0,0 +1,341 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
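The cgemm_ncopy_2 routines above pack two columns of the complex matrix A into a contiguous buffer, emitting one (re, im) pair from each of the two columns per row; the 4- and 8-column files that follow extend the same layout to wider panels. A hedged C reference of the 2-column packing, written to match the scalar fallback paths (.L_II1 / .L_N12) rather than the exact generic kernel interface:

    /* Pack a column-major complex matrix a (lda counted in complex elements)
     * into 2-column panels, as the cgemm_ncopy_2_{lsx,lasx}.S scalar paths do.
     * The vector paths above produce the same layout four rows at a time.    */
    static void cgemm_ncopy_2_ref(long m, long n, const float *a, long lda, float *b)
    {
        long j = 0;
        for (; j + 1 < n; j += 2) {
            const float *a1 = a + 2 * j * lda;          /* column j   */
            const float *a2 = a + 2 * (j + 1) * lda;    /* column j+1 */
            for (long i = 0; i < m; i++) {
                b[0] = a1[0]; b[1] = a1[1];             /* (re, im) from column j   */
                b[2] = a2[0]; b[3] = a2[1];             /* (re, im) from column j+1 */
                a1 += 2; a2 += 2; b += 4;
            }
        }
        if (n & 1) {                                    /* trailing single column */
            const float *a1 = a + 2 * j * lda;
            for (long i = 0; i < m; i++) {
                b[0] = a1[0]; b[1] = a1[1];
                a1 += 2; b += 2;
            }
        }
    }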
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define TD $r20 +#define TS $r11 +#define TL $r19 +#define T0 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LSX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define D0 $vr8 +#define D1 $vr9 +#define D2 $vr10 +#define D3 $vr11 +#define D4 $vr12 +#define D5 $vr13 +#define D6 $vr14 +#define D7 $vr15 +#define D8 $vr16 + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TD, DST //boffset + move TS, SRC //aoffset + + slli.d TL, LDA, 0x02 + slli.d TL, TL, 0x01 + + srai.d J, N, 0x02 + beq J, ZERO, .L_N0 + +.L_J1: /* J-- */ + move S1, TS + add.d S2, S1, TL + add.d S3, S2, TL + add.d S4, S3, TL + + slli.d T0, TL, 0x02 + add.d TS, TS, T0 + + srai.d I, M, 0x02 + beq I, ZERO, .L_I3 + +.L_I1: /* I-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vld U4, S3, 0x00 + vld U5, S3, 0x10 + + vld U6, S4, 0x00 + vld U7, S4, 0x10 + + vand.v D0, U2, U2 + vand.v D1, U3, U3 + vand.v D2, U2, U2 + vand.v D3, U3, U3 + vand.v D4, U6, U6 + vand.v D5, U7, U7 + vand.v D6, U6, U6 + vand.v D7, U7, U7 + + vpermi.w D0, U0, 0x44 + vpermi.w D4, U4, 0x44 + vpermi.w D2, U0, 0xee + vpermi.w D6, U4, 0xee + vpermi.w D1, U1, 0x44 + vpermi.w D5, U5, 0x44 + vpermi.w D3, U1, 0xee + vpermi.w D7, U5, 0xee + + vst D0, TD, 0x00 + vst D4, TD, 0x10 + vst D2, TD, 0x20 + vst D6, TD, 0x30 + vst D1, TD, 0x40 + vst D5, TD, 0x50 + vst D3, TD, 0x60 + vst D7, TD, 0x70 + + addi.d S1, S1, 0x20 // a_offset + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d TD, TD, 0x80 // b_offset + + addi.d I, I, -1 + blt ZERO, I, .L_I1 + +.L_I3: /* if(m&2) */ + andi I, M, 0x02 + beq I, ZERO, .L_II20 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + vld U2, S3, 0x00 + vld U3, S4, 0x00 + + vand.v D0, U1, U1 + vand.v D1, U1, U1 + vand.v D2, U3, U3 + vand.v D3, U3, U3 + + vpermi.w D0, U0, 0x44 + vpermi.w D2, U2, 0x44 + vpermi.w D1, U0, 0xee + vpermi.w D3, U2, 0xee + + vst D0, TD, 0x00 + vst D2, TD, 0x10 + vst D1, TD, 0x20 + vst D3, TD, 0x30 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d TD, TD, 0x40 + +.L_II20: /* if(m&1) */ + andi I, M, 0x01 + beq I, ZERO, .L_J0 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fld.s F4, S3, 0x00 + fld.s F5, S3, 0x04 + + fld.s F6, S4, 0x00 + fld.s F7, S4, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0c + fst.s F4, TD, 0x10 + fst.s F5, TD, 0x14 + fst.s F6, TD, 0x18 + fst.s F7, TD, 0x1c + + addi.d TD, TD, 0x20 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_N0: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_N20 + + move S1, TS + add.d S2, S1, TL + + slli.d T0, TL, 0x01 + add.d TS, TS, T0 + + srai.d I, M, 0x02 + beq ZERO, I, .L_N10 + +.L_N11: /* if(i>0) */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vand.v D0, U2, U2 + vand.v 
D1, U3, U3 + vand.v D2, U2, U2 + vand.v D3, U3, U3 + + vpermi.w D0, U0, 0x44 + vpermi.w D2, U0, 0xee + vpermi.w D1, U1, 0x44 + vpermi.w D3, U1, 0xee + + vst D0, TD, 0x00 + vst D2, TD, 0x10 + vst D1, TD, 0x20 + vst D3, TD, 0x30 + + addi.d S1, S1, 0x20 // a_offset + addi.d S2, S2, 0x20 + addi.d TD, TD, 0x40 // b_offset + + addi.d I, I, -1 + blt ZERO, I, .L_N11 + +.L_N10: /* if(m&2) */ + andi I, M, 0x02 + beq I, ZERO, .L_N130 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + vand.v D0, U1, U1 + + vpermi.w D0, U0, 0x44 + vpermi.w U1, U0, 0xee + + vst D0, TD, 0x00 + vst U1, TD, 0x10 + + addi.d S1, S1, 0x10 // a_offset + addi.d S2, S2, 0x10 + addi.d TD, TD, 0x20 // b_offset + +.L_N130: /* if(m&1) */ + andi I, M, 0x01 + beq I, ZERO, .L_N20 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0c + + addi.d TD, TD, 0x10 + +.L_N20: /* if(n&1) */ + andi I, N, 0x01 + beq I, ZERO, .L_N00 + + move S1, TS + srai.d I, M, 0x02 + + beq I, ZERO, .L_N30 + +.L_N21: /* if(i>0) */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + + addi.d S1, S1, 0x20 // aoffset1 + addi.d TD, TD, 0x20 // b_offset + + addi.d I, I, -1 + blt ZERO, I, .L_N21 + +.L_N30: /* if(m&2) */ + andi I, M, 0x02 + beq I, ZERO, .L_N330 + + vld U0, S1, 0x00 + + vst U0, TD, 0x00 + + addi.d S1, S1, 0x10 // aoffset1 + addi.d TD, TD, 0x10 // b_offset + +.L_N330: /* if(m&1) */ + andi I, M, 0x01 + beq I, ZERO, .L_N00 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + +.L_N00: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/cgemm_ncopy_8_lsx.S b/kernel/loongarch64/cgemm_ncopy_8_lsx.S new file mode 100644 index 000000000..87a88e37d --- /dev/null +++ b/kernel/loongarch64/cgemm_ncopy_8_lsx.S @@ -0,0 +1,263 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define TD $r20 +#define TS $r11 +#define TL $r7 +#define T0 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LSX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define D0 $vr8 +#define D1 $vr9 +#define D2 $vr10 +#define D3 $vr11 +#define D4 $vr12 +#define D5 $vr13 +#define D6 $vr14 +#define D7 $vr15 +#define D8 $vr16 + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TD, DST //boffset + move TS, SRC //aoffset + + slli.d TL, LDA, 0x02 //lda + slli.d TL, TL, 0x01 + + slli.d T0, TL, 0x03 + srai.d J, N, 0x03 //j + + beq J, ZERO, .L_N1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS + add.d S2, TS, TL + move I, M + add.d S3, S2, TL + add.d S4, S3, TL + add.d S5, S4, TL + add.d S6, S5, TL + add.d S7, S6, TL + add.d S8, S7, TL + add.d TS, TS, T0 + + beq I, ZERO, .L_J11 + +.L_I1: /* if(i>0) i--*/ + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + fld.s F4, S3, 0x00 + fld.s F5, S3, 0x04 + fld.s F6, S4, 0x00 + fld.s F7, S4, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0c + fst.s F4, TD, 0x10 + fst.s F5, TD, 0x14 + fst.s F6, TD, 0x18 + fst.s F7, TD, 0x1c + + fld.s F0, S5, 0x00 + fld.s F1, S5, 0x04 + fld.s F2, S6, 0x00 + fld.s F3, S6, 0x04 + fld.s F4, S7, 0x00 + fld.s F5, S7, 0x04 + fld.s F6, S8, 0x00 + fld.s F7, S8, 0x04 + + fst.s F0, TD, 0x20 + fst.s F1, TD, 0x24 + fst.s F2, TD, 0x28 + fst.s F3, TD, 0x2c + fst.s F4, TD, 0x30 + fst.s F5, TD, 0x34 + fst.s F6, TD, 0x38 + fst.s F7, TD, 0x3c + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d S5, S5, 0x08 + addi.d S6, S6, 0x08 + addi.d S7, S7, 0x08 + addi.d S8, S8, 0x08 + addi.d TD, TD, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_I1 + +.L_J11: /* j--*/ + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_N1: /* if(n&4)*/ + andi I, N, 0x04 + beq I, ZERO, .L_N2 + + move S1, TS + add.d S2, TS, TL + move I, M + add.d S3, S2, TL + add.d S4, S3, TL + add.d TS, S4, TL + + beq I, ZERO, .L_N2 + +.L_N11: /* if(i>0)*/ + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + fld.s F4, S3, 0x00 + fld.s F5, S3, 0x04 + fld.s F6, S4, 0x00 + fld.s F7, S4, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0c + fst.s F4, TD, 0x10 + fst.s F5, TD, 0x14 + fst.s F6, TD, 0x18 + fst.s F7, TD, 0x1c + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d TD, TD, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_N11 + +.L_N2: /* if(n&2)*/ + andi I, N, 0x02 + beq I, ZERO, .L_N3 + + move S1, TS + add.d S2, TS, TL + move I, M + add.d TS, S2, TL + + beq I, ZERO, .L_N3 + +.L_N21: /* if(i>0)*/ + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0c + + addi.d S1, S1, 0x08 + 
addi.d S2, S2, 0x08
+    addi.d TD, TD, 0x10
+
+    addi.d I, I, -1
+    blt ZERO, I, .L_N21
+
+.L_N3: /* if(n&1)*/
+    andi I, N, 0x01
+    beq I, ZERO, .L_N0
+
+    move S1, TS
+    move I, M
+
+    beq I, ZERO, .L_N0
+
+.L_N31: /* if(i>0)*/
+    fld.s F0, S1, 0x00
+    fld.s F1, S1, 0x04
+
+    fst.s F0, TD, 0x00
+    fst.s F1, TD, 0x04
+
+    addi.d S1, S1, 0x08
+    addi.d TD, TD, 0x08
+
+    addi.d I, I, -1
+    blt ZERO, I, .L_N31
+
+.L_N0:
+    LDARG $r23, $sp, 0
+    addi.d $sp, $sp, 8
+    jirl $r0, $r1, 0x00
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/cgemm_tcopy_2_lasx.S b/kernel/loongarch64/cgemm_tcopy_2_lasx.S
new file mode 100644
index 000000000..e2245e412
--- /dev/null
+++ b/kernel/loongarch64/cgemm_tcopy_2_lasx.S
@@ -0,0 +1,218 @@
+/*******************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define TD $r16 +#define TS $r17 +#define TL $r7 +#define T0 $r18 +#define S8 $r19 +#define S9 $r20 +#define S10 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define D0 $xr8 +#define D1 $xr9 +#define D2 $xr10 +#define D3 $xr11 +#define D4 $xr12 +#define D5 $xr13 +#define D6 $xr14 +#define D7 $xr15 + + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TS, SRC //aoffset + move TD, DST //boffset + slli.d TL, LDA, 0x02 //lda + slli.d TL, TL, 0x01 + + ori T0, ZERO, 0x01 + andn T0, N, T0 + mul.d T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x02 + add.d S9, DST, T0 //boffset2 + + srai.d J, M, 0x01 //j + + beq J, ZERO, .L_M1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 + add.d S2, S1, TL //aoffset2 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x20 + + srai.d I, N, 0x02 + beq ZERO, I, .L_JN1 + +.L_JI1: /* if(i>0) i--*/ + xvld U0, S1, 0x00 + xvld U1, S1, 0x00 + xvld U2, S2, 0x00 + + xvpermi.q U0, U2, 0x02 + xvpermi.q U2, U1, 0x31 + + xvst U0, S8, 0x00 + + slli.d T0, M, 0x04 + add.d S8, S8, T0 + + xvst U2, S8, 0x00 + + add.d S8, S8, T0 + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_JI1 + +.L_JN1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_JN2 + + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + + vst $vr0, S8, 0x00 + vst $vr1, S8, 0x10 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + +.L_JN2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_J0 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, S9, 0x00 + fst.s F1, S9, 0x04 + fst.s F2, S9, 0x08 + fst.s F3, S9, 0x0c + + addi.d S9, S9, 0x10 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_M0 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M1N1 + +.L_M1I1: /* if(i>0) */ + vld $vr0, TS, 0x00 + vld $vr1, TS, 0x10 + + vst $vr0, TD, 0x00 + + slli.d T0, M, 0x04 + add.d TD, TD, T0 + + vst $vr1, TD, 0x00 + + add.d TD, TD, T0 + addi.d TS, TS, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_M1I1 + +.L_M1N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M1N2 + + vld $vr0, TS, 0x00 + + vst $vr0, TD, 0x00 + + addi.d TS, TS, 0x10 + +.L_M1N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + fld.s F0, TS, 0x00 + fld.s F1, TS, 0x04 + + fst.s F0, S9, 0x00 + fst.s F1, S9, 0x04 + +.L_M0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/cgemm_tcopy_2_lsx.S b/kernel/loongarch64/cgemm_tcopy_2_lsx.S new file mode 100644 index 000000000..15c0fde8f --- /dev/null +++ b/kernel/loongarch64/cgemm_tcopy_2_lsx.S @@ -0,0 +1,218 @@ +/******************************************************************************* +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define TD $r16 +#define TS $r17 +#define TL $r7 +#define T0 $r18 +#define S8 $r19 +#define S9 $r20 +#define S10 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define D0 $vr8 +#define D1 $vr9 +#define D2 $vr10 +#define D3 $vr11 +#define D4 $vr12 +#define D5 $vr13 +#define D6 $vr14 +#define D7 $vr15 + + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TS, SRC //aoffset + move TD, DST //boffset + slli.d TL, LDA, 0x02 //lda + slli.d TL, TL, 0x01 + + ori T0, ZERO, 0x01 + andn T0, N, T0 + mul.d T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x02 + add.d S9, DST, T0 //boffset2 + + srai.d J, M, 0x01 //j + + beq J, ZERO, .L_M1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 + add.d S2, S1, TL //aoffset2 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x20 + + srai.d I, N, 0x02 + beq ZERO, I, .L_JN1 + +.L_JI1: /* if(i>0) i--*/ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vst U0, S8, 0x00 + vst U2, S8, 0x10 + + slli.d T0, M, 0x04 + add.d S8, S8, T0 + + vst U1, S8, 0x00 + vst U3, S8, 0x10 + + add.d S8, S8, T0 + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_JI1 + +.L_JN1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_JN2 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + + vst U0, S8, 0x00 + vst U1, S8, 0x10 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + +.L_JN2: /* if(n&1) */ + andi I, N, 0x01 + 
beq ZERO, I, .L_J0 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, S9, 0x00 + fst.s F1, S9, 0x04 + fst.s F2, S9, 0x08 + fst.s F3, S9, 0x0c + + addi.d S9, S9, 0x10 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_M0 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M1N1 + +.L_M1I1: /* if(i>0) */ + vld U0, TS, 0x00 + vld U1, TS, 0x10 + + vst U0, TD, 0x00 + + slli.d T0, M, 0x04 + add.d TD, TD, T0 + + vst U1, TD, 0x00 + + add.d TD, TD, T0 + addi.d TS, TS, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_M1I1 + +.L_M1N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M1N2 + + vld U0, TS, 0x00 + + vst U0, TD, 0x00 + + addi.d TS, TS, 0x10 + +.L_M1N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + fld.s F0, TS, 0x00 + fld.s F1, TS, 0x04 + + fst.s F0, S9, 0x00 + fst.s F1, S9, 0x04 + +.L_M0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/cgemm_tcopy_4_lsx.S b/kernel/loongarch64/cgemm_tcopy_4_lsx.S new file mode 100644 index 000000000..6d63d62e7 --- /dev/null +++ b/kernel/loongarch64/cgemm_tcopy_4_lsx.S @@ -0,0 +1,324 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define TD $r16 +#define TS $r17 +#define TL $r18 +#define T0 $r19 +#define S8 $r20 +#define S9 $r23 +#define S10 $r11 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LSX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define U8 $vr8 +#define U9 $vr9 +#define U10 $vr10 +#define U11 $vr11 +#define U12 $vr12 +#define U13 $vr13 +#define U14 $vr14 +#define U15 $vr15 + + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TS, SRC //aoffset + move TD, DST //boffset + slli.d TL, LDA, 0x02 //lda + slli.d TL, TL, 0x01 //lda + + ori T0, ZERO, 0x03 + andn T0, N, T0 + mul.w T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x02 + add.d S9, DST, T0 //boffset2 + + ori T0, ZERO, 0x01 + andn T0, N, T0 + mul.w T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x02 + add.d S10, DST, T0 //boffset3 + + srai.d J, M, 0x02 //j + + beq J, ZERO, .L_M1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS //aoffset1 + add.d S2, S1, TL + add.d S3, S2, TL + add.d S4, S3, TL + + slli.d T0, TL, 0x02 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x80 + + srai.d I, N, 0x02 + + beq ZERO, I, .L_JN1 + +.L_JI1: /* if(i>0) i--*/ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vld U4, S3, 0x00 + vld U5, S3, 0x10 + + vld U6, S4, 0x00 + vld U7, S4, 0x10 + + vst U0, S8, 0x00 + vst U1, S8, 0x10 + vst U2, S8, 0x20 + vst U3, S8, 0x30 + vst U4, S8, 0x40 + vst U5, S8, 0x50 + vst U6, S8, 0x60 + vst U7, S8, 0x70 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + slli.d T0, M, 0x05 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_JI1 + +.L_JN1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_JN2 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + vld U2, S3, 0x00 + vld U3, S4, 0x00 + + vst U0, S9, 0x00 + vst U1, S9, 0x10 + vst U2, S9, 0x20 + vst U3, S9, 0x30 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d S9, S9, 0x40 + +.L_JN2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_J0 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fld.s F4, S3, 0x00 + fld.s F5, S3, 0x04 + + fld.s F6, S4, 0x00 + fld.s F7, S4, 0x04 + + fst.s F0, S10, 0x00 + fst.s F1, S10, 0x04 + fst.s F2, S10, 0x08 + fst.s F3, S10, 0x0c + fst.s F4, S10, 0x10 + fst.s F5, S10, 0x14 + fst.s F6, S10, 0x18 + fst.s F7, S10, 0x1c + + addi.d S10, S10, 0x20 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_M1: /* if(m&2) */ + andi I, M, 0x02 + beq ZERO, I, .L_M2 + + move S1, TS //aoffset1 + add.d S2, S1, TL + + slli.d T0, TL, 0x01 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x40 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M1N1 + +.L_M1I1: /* if(i>0) */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vst U0, S8, 0x00 + vst U1, S8, 0x10 + vst U2, S8, 0x20 + vst U3, S8, 0x30 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + slli.d T0, M, 0x05 + 
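+    // T0 = M << 5: stride between consecutive 4-column panels in the packed buffer (M rows * 4 complex floats * 8 bytes)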
add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_M1I1 + +.L_M1N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M1N2 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + + vst U0, S9, 0x00 + vst U1, S9, 0x10 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S9, S9, 0x20 + +.L_M1N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M2 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, S10, 0x00 + fst.s F1, S10, 0x04 + fst.s F2, S10, 0x08 + fst.s F3, S10, 0x0c + + addi.d S10, S10, 0x10 + +.L_M2: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_M0 + + move S1, TS //aoffset1 + move S8, TD //boffset1 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M2N1 + +.L_M2I1: /* if(i>0) */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vst U0, S8, 0x00 + vst U1, S8, 0x10 + + addi.d S1, S1, 0x20 + slli.d T0, M, 0x05 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_M2I1 + +.L_M2N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M2N2 + + vld U0, S1, 0x00 + + vst U0, S9, 0x00 + + addi.d S1, S1, 0x10 + +.L_M2N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fst.s F0, S10, 0x00 + fst.s F1, S10, 0x04 + +.L_M0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/cgemm_tcopy_8_lsx.S b/kernel/loongarch64/cgemm_tcopy_8_lsx.S new file mode 100644 index 000000000..2935bbc07 --- /dev/null +++ b/kernel/loongarch64/cgemm_tcopy_8_lsx.S @@ -0,0 +1,277 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define TD $r20 +#define TS $r11 +#define TL $r7 +#define T0 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define D0 $vr8 +#define D1 $vr9 +#define D2 $vr10 +#define D3 $vr11 +#define D4 $vr12 +#define D5 $vr13 +#define D6 $vr14 +#define D7 $vr15 + + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TS, SRC //aoffset + move TD, DST //boffset + slli.d TL, LDA, 0x02 //lda + slli.d TL, TL, 0x01 + + srai.d J, N, 0x03 //j + + beq J, ZERO, .L_N1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 //2*lda + add.d S2, TS, TL + addi.d TS, TS, 0x40 + + srai.d I, M, 0x01 + beq ZERO, I, .L_J1M1 + +.L_J1I1: /* if(i>0) i--*/ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + + vld U4, S2, 0x00 + vld U5, S2, 0x10 + vld U6, S2, 0x20 + vld U7, S2, 0x30 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + vst U2, TD, 0x20 + vst U3, TD, 0x30 + vst U4, TD, 0x40 + vst U5, TD, 0x50 + vst U6, TD, 0x60 + vst U7, TD, 0x70 + + add.d S1, S1, T0 + add.d S2, S2, T0 + addi.d TD, TD, 0x80 + + addi.d I, I, -1 + blt ZERO, I, .L_J1I1 + +.L_J1M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_J0 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + vst U2, TD, 0x20 + vst U3, TD, 0x30 + + addi.d TD, TD, 0x40 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_N1: /* if(n&4) */ + andi I, N, 0x04 + beq ZERO, I, .L_N2 + + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 //2*lda + add.d S2, TS, TL + addi.d TS, TS, 0x20 + + srai.d I, M, 0x01 + beq ZERO, I, .L_N1M1 + +.L_N1I1: /* if(i>0) i-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + vst U2, TD, 0x20 + vst U3, TD, 0x30 + + add.d S1, S1, T0 + add.d S2, S2, T0 + addi.d TD, TD, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_N1I1 + +.L_N1M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_N2 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + + addi.d TD, TD, 0x20 + +.L_N2: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_N3 + + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 //2*lda + add.d S2, TS, TL + addi.d TS, TS, 0x10 + + srai.d I, M, 0x01 + beq ZERO, I, .L_N2M1 + +.L_N2I1: /* if(i>0) i-- */ + vld U0, S1, 0x00 + vld U1, S2, 0x00 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + + add.d S1, S1, T0 + add.d S2, S2, T0 + + addi.d TD, TD, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_N2I1 + +.L_N2M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_N3 + + vld U0, S1, 0x00 + + vst U0, TD, 0x00 + + addi.d TD, TD, 0x10 + +.L_N3: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_N0 + + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 //2*lda + add.d S2, TS, TL + + srai.d I, M, 0x01 + beq ZERO, I, .L_N3M1 + +.L_N3I1: /* if(i>0) i-- */ + fld.s F0, 
S1, 0x00 + fld.s F1, S1, 0x04 + + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0c + + add.d S1, S1, T0 + add.d S2, S2, T0 + addi.d TD, TD, 0x10 + + addi.d I, I, -1 + blt ZERO, I, .L_N3I1 + +.L_N3M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_N0 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + +.L_N0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/cnrm2_lasx.S b/kernel/loongarch64/cnrm2_lasx.S new file mode 100644 index 000000000..3a60069ac --- /dev/null +++ b/kernel/loongarch64/cnrm2_lasx.S @@ -0,0 +1,147 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r17
+#define TEMP $r18
+#define t1 $r12
+#define t2 $r13
+#define t3 $r14
+#define t4 $r15
+#define a1 $f15
+#define a2 $f16
+#define res $f19
+#define VX0 $xr15
+#define VX1 $xr16
+#define VX2 $xr17
+#define VX3 $xr18
+#define VX4 $xr21
+#define res1 $xr19
+#define res2 $xr20
+
+    PROLOGUE
+
+#ifdef F_INTERFACE
+    LDINT N, 0(N)
+    LDINT INCX, 0(INCX)
+#endif
+
+    xvxor.v res1, res1, res1
+    xvxor.v res2, res2, res2
+    bge $r0, N, .L999
+    beq $r0, INCX, .L999
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, ZBASE_SHIFT
+    slli.d INCX, INCX, ZBASE_SHIFT
+    srai.d I, N, 2
+    bne INCX, TEMP, .L20
+    bge $r0, I, .L997
+    .align 3
+
+.L10:
+    xvld VX0, X, 0 * SIZE
+    xvfcvtl.d.s VX1, VX0
+    xvfcvth.d.s VX2, VX0
+    xvfmadd.d res1, VX1, VX1, res1
+    xvfmadd.d res2, VX2, VX2, res2
+    addi.d I, I, -1
+    addi.d X, X, 8 * SIZE
+    blt $r0, I, .L10
+    .align 3
+    b .L996
+
+.L20:
+    bge $r0, I, .L997
+    .align 3
+
+.L21:
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    add.d X, X, INCX
+    xvfcvtl.d.s VX1, VX0
+    xvfcvth.d.s VX2, VX0
+    xvfmadd.d res1, VX1, VX1, res1
+    xvfmadd.d res2, VX2, VX2, res2
+    addi.d I, I, -1
+    blt $r0, I, .L21
+    b .L996
+
+.L996:
+    xvfadd.d res1, res1, res2
+    xvpickve.d VX1, res1, 1
+    xvpickve.d VX2, res1, 2
+    xvpickve.d VX3, res1, 3
+    xvfadd.d res1, VX1, res1
+    xvfadd.d res1, VX2, res1
+    xvfadd.d res1, VX3, res1
+    .align 3
+
+.L997:
+    andi I, N, 3
+    bge $r0, I, .L999
+    .align 3
+
+.L998:
+    fld.s a1, X, 0 * SIZE
+    fld.s a2, X, 1 * SIZE
+    addi.d I, I, -1
+    fcvt.d.s a1, a1
+    fcvt.d.s a2, a2
+    fmadd.d res, a1, a1, res
+    fmadd.d res, a2, a2, res
+    add.d X, X, INCX
+    blt $r0, I, .L998
+    .align 3
+
+.L999:
+    fsqrt.d res, res
+    move $r4, $r17
+    fcvt.s.d $f0, res
+    jirl $r0, $r1, 0x0
+
+    EPILOGUE
diff --git a/kernel/loongarch64/cnrm2_lsx.S b/kernel/loongarch64/cnrm2_lsx.S
new file mode 100644
index 000000000..20950ba17
--- /dev/null
+++ b/kernel/loongarch64/cnrm2_lsx.S
@@ -0,0 +1,155 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED.
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r12 +#define t2 $r13 +#define t3 $r14 +#define t4 $r15 +#define a1 $f15 +#define a2 $f16 +#define res $f19 +#define VX0 $vr15 +#define VX1 $vr16 +#define VX2 $vr17 +#define VX3 $vr18 +#define VX4 $vr21 +#define res1 $vr19 +#define res2 $vr20 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + bge $r0, N, .L999 + beq $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bge $r0, I, .L997 + .align 3 + +.L10: + vld VX0, X, 0 * SIZE + vfcvtl.d.s VX1, VX0 + vfcvth.d.s VX2, VX0 + vfmadd.d res1, VX1, VX1, res1 + vfmadd.d res2, VX2, VX2, res2 + vld VX0, X, 4 * SIZE + vfcvtl.d.s VX3, VX0 + vfcvth.d.s VX4, VX0 + vfmadd.d res1, VX3, VX3, res1 + vfmadd.d res2, VX4, VX4, res2 + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L10 + b .L996 + .align 3 + +.L20: + bge $r0, I, .L997 + .align 3 + +.L21: + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfcvtl.d.s VX1, VX0 + vfcvth.d.s VX2, VX0 + vfmadd.d res1, VX1, VX1, res1 + vfmadd.d res2, VX2, VX2, res2 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfcvtl.d.s VX3, VX0 + vfcvth.d.s VX4, VX0 + vfmadd.d res1, VX3, VX3, res1 + vfmadd.d res2, VX4, VX4, res2 + addi.d I, I, -1 + blt $r0, I, .L21 + b .L996 + .align 3 + +.L996: + vfadd.d res1, res1, res2 + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 + .align 3 + +.L997: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.s a1, X, 0 * SIZE + fld.s a2, X, 1 * SIZE + addi.d I, I, -1 + fcvt.d.s a1, a1 + fcvt.d.s a2, a2 + fmadd.d res, a1, a1, res + fmadd.d res, a2, a2, res + add.d X, X, INCX + blt $r0, I, .L998 + .align 3 + +.L999: + fsqrt.d res, res + move $r4, $r17 + fcvt.s.d $f0, $f19 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/copy_lasx.S b/kernel/loongarch64/copy_lasx.S new file mode 100644 index 000000000..31f91cec1 --- /dev/null +++ b/kernel/loongarch64/copy_lasx.S @@ -0,0 +1,306 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define VX0 $xr12 +#define VX1 $xr13 + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +/* INCX==1 and INCY==1 */ +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: + xvld VX0, X, 0 + addi.d I, I, -1 + xvst VX0, Y, 0 +#ifdef DOUBLE + xvld VX0, X, 32 + xvst VX0, Y, 32 +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: + LD $f12, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + ST $f12, Y, 0 + addi.d Y, Y, SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +/* INCX==1 and INCY!=1 */ +.L12: + bge $r0, I, .L122 + .align 3 + +.L121: +#ifdef DOUBLE + xvld VX0, X, 0 + xvld VX1, X, 32 + xvstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.d VX0, Y, 0, 1 + add.d Y, Y, INCY + xvstelm.d VX0, Y, 0, 2 + add.d Y, Y, INCY + xvstelm.d VX0, Y, 0, 3 + add.d Y, Y, INCY + xvstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.d VX1, Y, 0, 1 + add.d Y, Y, INCY + xvstelm.d VX1, Y, 0, 2 + add.d Y, Y, INCY + xvstelm.d VX1, Y, 0, 3 + add.d Y, Y, INCY +#else + xvld VX0, X, 0 + xvstelm.w VX0, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 1 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 2 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 3 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 4 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 5 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 6 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 7 + add.d Y, Y, INCY +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + LD $f12, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + ST $f12, Y, 0 + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +/* 
INCX!=1 and INCY==1 */
+.L21:
+    bge $r0, I, .L212
+    .align 3
+
+.L211:
+#ifdef DOUBLE
+    ld.d t1, X, 0
+    add.d X, X, INCX
+    ld.d t2, X, 0
+    add.d X, X, INCX
+    ld.d t3, X, 0
+    add.d X, X, INCX
+    ld.d t4, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    xvst VX0, Y, 0
+    ld.d t1, X, 0
+    add.d X, X, INCX
+    ld.d t2, X, 0
+    add.d X, X, INCX
+    ld.d t3, X, 0
+    add.d X, X, INCX
+    ld.d t4, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    xvst VX1, Y, 32
+#else
+    ld.w t1, X, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    xvst VX0, Y, 0
+#endif
+    addi.d Y, Y, 8 * SIZE
+    addi.d I, I, -1
+    blt $r0, I, .L211
+    .align 3
+
+.L212:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L213:
+    LD $f12, X, 0
+    addi.d I, I, -1
+    ST $f12, Y, 0
+    add.d X, X, INCX
+    addi.d Y, Y, SIZE
+    blt $r0, I, .L213
+    b .L999
+    .align 3
+
+/* INCX!=1 and INCY!=1 */
+.L22:
+    bge $r0, I, .L223
+    .align 3
+
+.L222:
+    LD a1, X, 0
+    add.d X, X, INCX
+    LD a2, X, 0
+    add.d X, X, INCX
+    LD a3, X, 0
+    add.d X, X, INCX
+    LD a4, X, 0
+    add.d X, X, INCX
+    ST a1, Y, 0
+    add.d Y, Y, INCY
+    ST a2, Y, 0
+    add.d Y, Y, INCY
+    ST a3, Y, 0
+    add.d Y, Y, INCY
+    ST a4, Y, 0
+    add.d Y, Y, INCY
+    LD a1, X, 0
+    add.d X, X, INCX
+    LD a2, X, 0
+    add.d X, X, INCX
+    LD a3, X, 0
+    add.d X, X, INCX
+    LD a4, X, 0
+    add.d X, X, INCX
+    ST a1, Y, 0
+    add.d Y, Y, INCY
+    ST a2, Y, 0
+    add.d Y, Y, INCY
+    ST a3, Y, 0
+    add.d Y, Y, INCY
+    ST a4, Y, 0
+    add.d Y, Y, INCY
+    addi.d I, I, -1
+    blt $r0, I, .L222
+    .align 3
+
+.L223:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L224:
+    LD $f12, X, 0
+    addi.d I, I, -1
+    ST $f12, Y, 0
+    add.d X, X, INCX
+    add.d Y, Y, INCY
+    blt $r0, I, .L224
+    .align 3
+
+.L999:
+    move $r4, $r12
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/copy_lsx.S b/kernel/loongarch64/copy_lsx.S
new file mode 100644
index 000000000..bb10f3565
--- /dev/null
+++ b/kernel/loongarch64/copy_lsx.S
@@ -0,0 +1,316 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define VX0 $vr12 +#define VX1 $vr13 + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +/* INCX==1 and INCY==1 */ +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: + vld VX0, X, 0 + vld VX1, X, 16 + addi.d I, I, -1 + vst VX0, Y, 0 + vst VX1, Y, 16 +#ifdef DOUBLE + vld VX0, X, 32 + vld VX1, X, 48 + vst VX0, Y, 32 + vst VX1, Y, 48 +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: + LD $f12, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + ST $f12, Y, 0 + addi.d Y, Y, SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +/* INCX==1 and INCY!=1 */ +.L12: + bge $r0, I, .L122 + .align 3 + +.L121: +#ifdef DOUBLE + vld VX0, X, 0 + vld VX1, X, 16 + vstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + vstelm.d VX0, Y, 0, 1 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0, 1 + add.d Y, Y, INCY + vld VX0, X, 32 + vld VX1, X, 48 + vstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + vstelm.d VX0, Y, 0, 1 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0, 1 + add.d Y, Y, INCY +#else + vld VX0, X, 0 + vld VX1, X, 16 + vstelm.w VX0, Y, 0, 0 + add.d Y, Y, INCY + vstelm.w VX0, Y, 0, 1 + add.d Y, Y, INCY + vstelm.w VX0, Y, 0, 2 + add.d Y, Y, INCY + vstelm.w VX0, Y, 0, 3 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0, 0 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0, 1 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0, 2 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0, 3 + add.d Y, Y, INCY +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + LD $f12, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + ST $f12, Y, 0 + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +/* INCX!=1 and INCY==1 */ +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 
1
+    vst VX0, Y, 0
+    vst VX1, Y, 16
+    ld.d t1, X, 0
+    add.d X, X, INCX
+    ld.d t2, X, 0
+    add.d X, X, INCX
+    ld.d t3, X, 0
+    add.d X, X, INCX
+    ld.d t4, X, 0
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vst VX0, Y, 32
+    vst VX1, Y, 48
+#else
+    ld.w t1, X, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    vst VX0, Y, 0
+    ld.w t1, X, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vst VX1, Y, 16
+#endif
+    addi.d Y, Y, 8 * SIZE
+    addi.d I, I, -1
+    blt $r0, I, .L211
+    .align 3
+
+.L212:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L213:
+    LD $f12, X, 0
+    addi.d I, I, -1
+    ST $f12, Y, 0
+    add.d X, X, INCX
+    addi.d Y, Y, SIZE
+    blt $r0, I, .L213
+    b .L999
+    .align 3
+
+/* INCX!=1 and INCY!=1 */
+.L22:
+    bge $r0, I, .L223
+    .align 3
+
+.L222:
+    LD a1, X, 0
+    add.d X, X, INCX
+    LD a2, X, 0
+    add.d X, X, INCX
+    LD a3, X, 0
+    add.d X, X, INCX
+    LD a4, X, 0
+    add.d X, X, INCX
+    ST a1, Y, 0
+    add.d Y, Y, INCY
+    ST a2, Y, 0
+    add.d Y, Y, INCY
+    ST a3, Y, 0
+    add.d Y, Y, INCY
+    ST a4, Y, 0
+    add.d Y, Y, INCY
+    LD a1, X, 0
+    add.d X, X, INCX
+    LD a2, X, 0
+    add.d X, X, INCX
+    LD a3, X, 0
+    add.d X, X, INCX
+    LD a4, X, 0
+    add.d X, X, INCX
+    ST a1, Y, 0
+    add.d Y, Y, INCY
+    ST a2, Y, 0
+    add.d Y, Y, INCY
+    ST a3, Y, 0
+    add.d Y, Y, INCY
+    ST a4, Y, 0
+    add.d Y, Y, INCY
+    addi.d I, I, -1
+    blt $r0, I, .L222
+    .align 3
+
+.L223:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L224:
+    LD $f12, X, 0
+    addi.d I, I, -1
+    ST $f12, Y, 0
+    add.d X, X, INCX
+    add.d Y, Y, INCY
+    blt $r0, I, .L224
+    .align 3
+
+.L999:
+    move $r4, $r12
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/crot_lasx.S b/kernel/loongarch64/crot_lasx.S
new file mode 100644
index 000000000..d4ec1e22c
--- /dev/null
+++ b/kernel/loongarch64/crot_lasx.S
@@ -0,0 +1,1079 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED.
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define C $f0 +#define S $f1 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define YY $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define VX0 $xr8 +#define VX1 $xr20 +#define VX2 $xr21 +#define VX3 $xr22 +#define VT0 $xr10 +#define VT1 $xr18 +#define VXC $xr23 +#define VXS $xr9 +#define VXZ $xr11 +#define x1 $xr12 +#define x2 $xr13 +#define x3 $xr14 +#define x4 $xr15 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + MTG t1, C + MTG t2, S + MTG t3, a1 +#ifdef DOUBLE + xvreplgr2vr.d VXC, t1 + xvreplgr2vr.d VXS, t2 + xvreplgr2vr.d VXZ, t3 + srai.d I, N, 2 +#else + xvreplgr2vr.w VXC, t1 + xvreplgr2vr.w VXS, t2 + xvreplgr2vr.w VXZ, t3 + srai.d I, N, 3 +#endif + beq INCX, $r0, .L996 + beq INCY, $r0, .L996 + bne INCX, TEMP, .L22 // INCX!=1 or INCY!=1 + bne INCY, TEMP, .L22 + +.L11: + bge $r0, I, .L997 + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L110 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L112 // C!=0 S==0 + b .L111 // C!=0 S!=0 + .align 3 + +.L110: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L114 // C==0 S==0 + b .L113 // C==0 S!=0 + .align 3 + +.L111: // C!=0 S!=0 + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvld VX3, Y, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 + xvfmul.d VX0, x1, VXC + xvfmadd.d VX0, x3, VXS, VX0 + xvfmul.d VX1, x1, VXS + xvfmsub.d VX1, x3, VXC, VX1 + xvfmul.d VX2, x2, VXC + xvfmadd.d VX2, x4, VXS, VX2 + xvfmul.d VX3, x2, VXS + xvfmsub.d VX3, x4, VXC, VX3 + xvilvl.d x1, VX2 ,VX0 + xvilvh.d x2, VX2, VX0 + xvilvl.d x3, VX3 ,VX1 + xvilvh.d x4, VX3, VX1 + xvst x1, X, 0 * SIZE + xvst x3, Y, 0 * SIZE + xvst x2, X, 4 * SIZE + xvst x4, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvld VX1, X, 8 * SIZE + xvld VX3, Y, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvpickev.w x3, VX3, VX2 + xvpickod.w x4, VX3, VX2 + xvfmul.s VX0, x1, VXC + xvfmadd.s VX0, x3, VXS, VX0 + xvfmul.s VX1, x1, VXS + xvfmsub.s VX1, x3, VXC, VX1 + xvfmul.s VX2, x2, VXC + xvfmadd.s VX2, x4, VXS, VX2 + xvfmul.s VX3, x2, VXS + xvfmsub.s VX3, x4, VXC, VX3 + xvilvl.w x1, VX2 ,VX0 + xvilvh.w x2, VX2, VX0 + xvilvl.w x3, VX3 ,VX1 + xvilvh.w x4, VX3, VX1 + xvst x1, X, 0 * SIZE + xvst x3, Y, 0 * SIZE + xvst x2, X, 8 * SIZE + xvst x4, Y, 8 * SIZE + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: // C!=0 S==0 + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE 
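+    // VX0/VX1 hold four complex doubles from X and VX2/VX3 from Y; the xvpickev/xvpickod pairs below split them into real (x1, x3) and imaginary (x2, x4) lanes before scaling by C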
+ xvld VX3, Y, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 + xvfmul.d VX0, x1, VXC + xvfmul.d VX1, x3, VXC + xvfmul.d VX2, x2, VXC + xvfmul.d VX3, x4, VXC + xvilvl.d x1, VX2 ,VX0 + xvilvh.d x2, VX2, VX0 + xvilvl.d x3, VX3 ,VX1 + xvilvh.d x4, VX3, VX1 + xvst x1, X, 0 * SIZE + xvst x3, Y, 0 * SIZE + xvst x2, X, 4 * SIZE + xvst x4, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvld VX1, X, 8 * SIZE + xvld VX3, Y, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvpickev.w x3, VX3, VX2 + xvpickod.w x4, VX3, VX2 + xvfmul.s VX0, x1, VXC + xvfmul.s VX1, x3, VXC + xvfmul.s VX2, x2, VXC + xvfmul.s VX3, x4, VXC + xvilvl.w x1, VX2 ,VX0 + xvilvh.w x2, VX2, VX0 + xvilvl.w x3, VX3 ,VX1 + xvilvh.w x4, VX3, VX1 + xvst x1, X, 0 * SIZE + xvst x3, Y, 0 * SIZE + xvst x2, X, 8 * SIZE + xvst x4, Y, 8 * SIZE + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: // C==0 S!=0 + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvld VX3, Y, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 + xvfmul.d VX0, x3, VXS + xvfmul.d VX1, x1, VXS + xvfsub.d VX1, VXZ, VX1 + xvfmul.d VX2, x4, VXS + xvfmul.d VX3, x2, VXS + xvfsub.d VX3, VXZ, VX3 + xvilvl.d x1, VX2 ,VX0 + xvilvh.d x2, VX2, VX0 + xvilvl.d x3, VX3 ,VX1 + xvilvh.d x4, VX3, VX1 + xvst x1, X, 0 * SIZE + xvst x3, Y, 0 * SIZE + xvst x2, X, 4 * SIZE + xvst x4, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvld VX1, X, 8 * SIZE + xvld VX3, Y, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvpickev.w x3, VX3, VX2 + xvpickod.w x4, VX3, VX2 + xvfmul.s VX0, x3, VXS + xvfmul.s VX1, x1, VXS + xvfsub.s VX1, VXZ, VX1 + xvfmul.s VX2, x4, VXS + xvfmul.s VX3, x2, VXS + xvfsub.s VX3, VXZ, VX3 + xvilvl.w x1, VX2 ,VX0 + xvilvh.w x2, VX2, VX0 + xvilvl.w x3, VX3 ,VX1 + xvilvh.w x4, VX3, VX1 + xvst x1, X, 0 * SIZE + xvst x3, Y, 0 * SIZE + xvst x2, X, 8 * SIZE + xvst x4, Y, 8 * SIZE + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: // C==0 S==0 + xvst VXZ, X, 0 * SIZE + xvst VXZ, Y, 0 * SIZE +#ifdef DOUBLE + xvst VXZ, X, 4 * SIZE + xvst VXZ, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvst VXZ, X, 8 * SIZE + xvst VXZ, Y, 8 * SIZE + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move YY, Y + move XX, X + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L220 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L222 // C!=0 S==0 + b .L221 // C!=0 S!=0 + .align 3 + +.L220: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L224 // C==0 S==0 + b .L223 // C==0 S!=0 + .align 3 + +.L221: // C!=0 S!=0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 1 + xvinsgr2vr.d x4, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + 
xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.d x3, t1, 2 + xvinsgr2vr.d x4, t2, 2 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 + add.d Y, Y, INCY + + xvfmul.d VX0, x1, VXC + xvfmadd.d VX0, x3, VXS, VX0 + xvfmul.d VX1, x1, VXS + xvfmsub.d VX1, x3, VXC, VX1 + xvfmul.d VX2, x2, VXC + xvfmadd.d VX2, x4, VXS, VX2 + xvfmul.d VX3, x2, VXS + xvfmsub.d VX3, x4, VXC, VX3 + xvstelm.d VX0, XX, 0, 0 + xvstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 1 + xvstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 2 + xvstelm.d VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 3 + xvstelm.d VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.d VX1, YY, 0, 0 + xvstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 1 + xvstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 2 + xvstelm.d VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 3 + xvstelm.d VX3, YY, 1 * SIZE, 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 + xvinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 + add.d Y, Y, INCY + + xvfmul.s VX0, x1, VXC + xvfmadd.s VX0, x3, VXS, VX0 + xvfmul.s VX1, x1, VXS + xvfmsub.s VX1, x3, VXC, VX1 + xvfmul.s VX2, x2, VXC + xvfmadd.s VX2, x4, VXS, VX2 + xvfmul.s VX3, x2, VXS + xvfmsub.s VX3, x4, VXC, VX3 + xvstelm.w VX0, XX, 0, 0 + xvstelm.w VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 1 + xvstelm.w VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 2 + xvstelm.w VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 3 + xvstelm.w VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.w VX1, YY, 0, 0 + 
xvstelm.w VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 1 + xvstelm.w VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 2 + xvstelm.w VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 3 + xvstelm.w VX3, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + xvstelm.w VX0, XX, 0, 4 + xvstelm.w VX2, XX, 1 * SIZE, 4 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 5 + xvstelm.w VX2, XX, 1 * SIZE, 5 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 6 + xvstelm.w VX2, XX, 1 * SIZE, 6 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 7 + xvstelm.w VX2, XX, 1 * SIZE, 7 + add.d XX, XX, INCX + xvstelm.w VX1, YY, 0, 4 + xvstelm.w VX3, YY, 1 * SIZE, 4 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 5 + xvstelm.w VX3, YY, 1 * SIZE, 5 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 6 + xvstelm.w VX3, YY, 1 * SIZE, 6 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 7 + xvstelm.w VX3, YY, 1 * SIZE, 7 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 + .align 3 + +.L222: // C!=0 S==0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 1 + xvinsgr2vr.d x4, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.d x3, t1, 2 + xvinsgr2vr.d x4, t2, 2 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 + add.d Y, Y, INCY + xvfmul.d VX0, x1, VXC + xvfmul.d VX1, x3, VXC + xvfmul.d VX2, x2, VXC + xvfmul.d VX3, x4, VXC + xvstelm.d VX0, XX, 0, 0 + xvstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 1 + xvstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 2 + xvstelm.d VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 3 + xvstelm.d VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.d VX1, YY, 0, 0 + xvstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 1 + xvstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 2 + xvstelm.d VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 3 + xvstelm.d VX3, YY, 1 * SIZE, 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 
+ xvinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 + add.d Y, Y, INCY + xvfmul.s VX0, x1, VXC + xvfmul.s VX1, x3, VXC + xvfmul.s VX2, x2, VXC + xvfmul.s VX3, x4, VXC + xvstelm.w VX0, XX, 0, 0 + xvstelm.w VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 1 + xvstelm.w VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 2 + xvstelm.w VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 3 + xvstelm.w VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.w VX1, YY, 0, 0 + xvstelm.w VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 1 + xvstelm.w VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 2 + xvstelm.w VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 3 + xvstelm.w VX3, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + xvstelm.w VX0, XX, 0, 4 + xvstelm.w VX2, XX, 1 * SIZE, 4 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 5 + xvstelm.w VX2, XX, 1 * SIZE, 5 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 6 + xvstelm.w VX2, XX, 1 * SIZE, 6 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 7 + xvstelm.w VX2, XX, 1 * SIZE, 7 + add.d XX, XX, INCX + xvstelm.w VX1, YY, 0, 4 + xvstelm.w VX3, YY, 1 * SIZE, 4 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 5 + xvstelm.w VX3, YY, 1 * SIZE, 5 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 6 + xvstelm.w VX3, YY, 1 * SIZE, 6 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 7 + xvstelm.w VX3, YY, 1 * SIZE, 7 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + b .L997 + .align 3 + +.L223: // C==0 S!=0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 1 + xvinsgr2vr.d x4, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.d x3, t1, 2 + xvinsgr2vr.d x4, t2, 2 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 + add.d Y, Y, INCY + xvfmul.d VX0, x3, VXS + xvfmul.d VX1, x1, VXS + xvfsub.d VX1, VXZ, VX1 + xvfmul.d VX2, x4, VXS + xvfmul.d VX3, x2, VXS + xvfsub.d VX3, VXZ, VX3 + xvstelm.d VX0, XX, 0, 0 + xvstelm.d 
VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 1 + xvstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 2 + xvstelm.d VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 3 + xvstelm.d VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.d VX1, YY, 0, 0 + xvstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 1 + xvstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 2 + xvstelm.d VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 3 + xvstelm.d VX3, YY, 1 * SIZE, 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 + xvinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 + add.d Y, Y, INCY + xvfmul.s VX0, x3, VXS + xvfmul.s VX1, x1, VXS + xvfsub.s VX1, VXZ, VX1 + xvfmul.s VX2, x4, VXS + xvfmul.s VX3, x2, VXS + xvfsub.s VX3, VXZ, VX3 + xvstelm.w VX0, XX, 0, 0 + xvstelm.w VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 1 + xvstelm.w VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 2 + xvstelm.w VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 3 + xvstelm.w VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.w VX1, YY, 0, 0 + xvstelm.w VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 1 + xvstelm.w VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 2 + xvstelm.w VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 3 + xvstelm.w VX3, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + xvstelm.w VX0, XX, 0, 4 + xvstelm.w VX2, XX, 1 * SIZE, 4 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 5 + xvstelm.w VX2, XX, 1 * SIZE, 5 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 6 + xvstelm.w VX2, XX, 1 * SIZE, 6 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 7 + xvstelm.w VX2, XX, 1 * SIZE, 7 + add.d XX, XX, INCX + xvstelm.w VX1, 
YY, 0, 4 + xvstelm.w VX3, YY, 1 * SIZE, 4 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 5 + xvstelm.w VX3, YY, 1 * SIZE, 5 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 6 + xvstelm.w VX3, YY, 1 * SIZE, 6 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 7 + xvstelm.w VX3, YY, 1 * SIZE, 7 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L223 + b .L997 + .align 3 + +.L224: // C==0 S==0 +#ifdef DOUBLE + xvstelm.d VXZ, XX, 0, 0 + xvstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 0 + xvstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 0 + xvstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 0 + xvstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d VXZ, YY, 0, 0 + xvstelm.d VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 0 + xvstelm.d VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 0 + xvstelm.d VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 0 + xvstelm.d VXZ, YY, 1 * SIZE, 0 +#else + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 + move X, XX + move Y, YY + b .L997 + .align 3 + +.L996: + move I, N + b .L998 + .align 3 + +.L997: +#ifdef DOUBLE + andi I, N, 3 +#else + andi I, N, 7 +#endif + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + MUL s1, a1, C + MADD s1, a3, S, s1 + MUL s2, a1, S + MSUB s2, a3, C, s2 + MUL s3, a2, C + MADD s3, a4, S, s3 + MUL s4, a2, S + MSUB s4, a4, C, s4 + addi.d I, I, -1 + ST s1, X, 0 * SIZE + ST s2, Y, 0 * SIZE + ST s3, X, 1 * SIZE + ST s4, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/crot_lsx.S b/kernel/loongarch64/crot_lsx.S new file mode 100644 index 000000000..126257edc --- /dev/null +++ b/kernel/loongarch64/crot_lsx.S @@ -0,0 +1,907 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define C $f0 +#define S $f1 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define YY $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define VX0 $vr8 +#define VX1 $vr20 +#define VX2 $vr21 +#define VX3 $vr22 +#define VT0 $vr10 +#define VT1 $vr18 +#define VXC $vr23 +#define VXS $vr9 +#define VXZ $vr11 +#define x1 $vr12 +#define x2 $vr13 +#define x3 $vr14 +#define x4 $vr15 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + MTG t1, C + MTG t2, S + MTG t3, a1 +#ifdef DOUBLE + vreplgr2vr.d VXC, t1 + vreplgr2vr.d VXS, t2 + vreplgr2vr.d VXZ, t3 +#else + vreplgr2vr.w VXC, t1 + vreplgr2vr.w VXS, t2 + vreplgr2vr.w VXZ, t3 + srai.d I, N, 2 +#endif + beq INCX, $r0, .L996 + beq INCY, $r0, .L996 + bne INCX, TEMP, .L22 // INCX!=1 or INCY!=1 + bne INCY, TEMP, .L22 + +.L11: + bge $r0, I, .L997 + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L110 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L112 // C!=0 S==0 + b .L111 // C!=0 S!=0 + .align 3 + +.L110: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L114 // C==0 S==0 + b .L113 // C==0 S!=0 + .align 3 + +.L111: // C!=0 S!=0 + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 + vfmul.d VX0, x1, VXC + vfmadd.d VX0, x3, VXS, VX0 + vfmul.d VX1, x1, VXS + vfmsub.d VX1, x3, VXC, VX1 + vfmul.d VX2, x2, VXC + vfmadd.d VX2, x4, VXS, VX2 + vfmul.d VX3, x2, VXS + vfmsub.d VX3, x4, VXC, VX3 + vilvl.d x1, VX2 ,VX0 + vilvh.d x2, VX2, VX0 + vilvl.d x3, VX3 ,VX1 + vilvh.d x4, VX3, VX1 + vst x1, X, 0 * SIZE + vst x3, Y, 0 * SIZE + vst x2, X, 2 * SIZE + vst x4, Y, 2 * SIZE + addi.d X, X, 4 * SIZE + addi.d Y, Y, 4 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + 
vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, VX2 + vfmul.s VX0, x1, VXC + vfmadd.s VX0, x3, VXS, VX0 + vfmul.s VX1, x1, VXS + vfmsub.s VX1, x3, VXC, VX1 + vfmul.s VX2, x2, VXC + vfmadd.s VX2, x4, VXS, VX2 + vfmul.s VX3, x2, VXS + vfmsub.s VX3, x4, VXC, VX3 + vilvl.w x1, VX2 ,VX0 + vilvh.w x2, VX2, VX0 + vilvl.w x3, VX3 ,VX1 + vilvh.w x4, VX3, VX1 + vst x1, X, 0 * SIZE + vst x3, Y, 0 * SIZE + vst x2, X, 4 * SIZE + vst x4, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: // C!=0 S==0 + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 + vfmul.d VX0, x1, VXC + vfmul.d VX1, x3, VXC + vfmul.d VX2, x2, VXC + vfmul.d VX3, x4, VXC + vilvl.d x1, VX2 ,VX0 + vilvh.d x2, VX2, VX0 + vilvl.d x3, VX3 ,VX1 + vilvh.d x4, VX3, VX1 + vst x1, X, 0 * SIZE + vst x3, Y, 0 * SIZE + vst x2, X, 2 * SIZE + vst x4, Y, 2 * SIZE + addi.d X, X, 4 * SIZE + addi.d Y, Y, 4 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, VX2 + vfmul.s VX0, x1, VXC + vfmul.s VX1, x3, VXC + vfmul.s VX2, x2, VXC + vfmul.s VX3, x4, VXC + vilvl.w x1, VX2 ,VX0 + vilvh.w x2, VX2, VX0 + vilvl.w x3, VX3 ,VX1 + vilvh.w x4, VX3, VX1 + vst x1, X, 0 * SIZE + vst x3, Y, 0 * SIZE + vst x2, X, 4 * SIZE + vst x4, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: // C==0 S!=0 + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 + vfmul.d VX0, x3, VXS + vfmul.d VX1, x1, VXS + vfsub.d VX1, VXZ, VX1 + vfmul.d VX2, x4, VXS + vfmul.d VX3, x2, VXS + vfsub.d VX3, VXZ, VX3 + vilvl.d x1, VX2 ,VX0 + vilvh.d x2, VX2, VX0 + vilvl.d x3, VX3 ,VX1 + vilvh.d x4, VX3, VX1 + vst x1, X, 0 * SIZE + vst x3, Y, 0 * SIZE + vst x2, X, 2 * SIZE + vst x4, Y, 2 * SIZE + addi.d X, X, 4 * SIZE + addi.d Y, Y, 4 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, VX2 + vfmul.s VX0, x3, VXS + vfmul.s VX1, x1, VXS + vfsub.s VX1, VXZ, VX1 + vfmul.s VX2, x4, VXS + vfmul.s VX3, x2, VXS + vfsub.s VX3, VXZ, VX3 + vilvl.w x1, VX2 ,VX0 + vilvh.w x2, VX2, VX0 + vilvl.w x3, VX3 ,VX1 + vilvh.w x4, VX3, VX1 + vst x1, X, 0 * SIZE + vst x3, Y, 0 * SIZE + vst x2, X, 4 * SIZE + vst x4, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: // C==0 S==0 + vst VXZ, X, 0 * SIZE + vst VXZ, Y, 0 * SIZE +#ifdef DOUBLE + vst VXZ, X, 2 * SIZE + vst VXZ, Y, 2 * SIZE + addi.d X, X, 4 * SIZE + addi.d Y, Y, 4 * SIZE +#else + vst VXZ, X, 4 * SIZE + vst VXZ, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L22: +#ifdef DOUBLE + srai.d I, N, 2 +#endif + bge $r0, I, .L997 + move YY, Y + move XX, X + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L220 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L222 // C!=0 S==0 + b .L221 // C!=0 S!=0 + .align 3 + +.L220: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L224 // C==0 S==0 + b .L223 // C==0 S!=0 + .align 3 + 
+.L221: // C!=0 S!=0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, x1, VXC + vfmadd.d VX0, x3, VXS, VX0 + vfmul.d VX1, x1, VXS + vfmsub.d VX1, x3, VXC, VX1 + vfmul.d VX2, x2, VXC + vfmadd.d VX2, x4, VXS, VX2 + vfmul.d VX3, x2, VXS + vfmsub.d VX3, x4, VXC, VX3 + vstelm.d VX0, XX, 0, 0 + vstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VX0, XX, 0, 1 + vstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.d VX1, YY, 0, 0 + vstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + vstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, x1, VXC + vfmadd.d VX0, x3, VXS, VX0 + vfmul.d VX1, x1, VXS + vfmsub.d VX1, x3, VXC, VX1 + vfmul.d VX2, x2, VXC + vfmadd.d VX2, x4, VXS, VX2 + vfmul.d VX3, x2, VXS + vfmsub.d VX3, x4, VXC, VX3 + vstelm.d VX0, XX, 0, 0 + vstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VX0, XX, 0, 1 + vstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.d VX1, YY, 0, 0 + vstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + vstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L221 + b .L995 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + vinsgr2vr.w x3, t1, 2 + vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + + vfmul.s VX0, x1, VXC + vfmadd.s VX0, x3, VXS, VX0 + vfmul.s VX1, x1, VXS + vfmsub.s VX1, x3, VXC, VX1 + vfmul.s VX2, x2, VXC + vfmadd.s VX2, x4, VXS, VX2 + vfmul.s VX3, x2, VXS + vfmsub.s VX3, x4, VXC, VX3 + vstelm.w VX0, XX, 0, 0 + vstelm.w VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 1 + vstelm.w VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 2 + vstelm.w VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 3 + vstelm.w VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + vstelm.w VX1, YY, 0, 0 + vstelm.w VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w 
VX1, YY, 0, 1 + vstelm.w VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 2 + vstelm.w VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 3 + vstelm.w VX3, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 +#endif + .align 3 + +.L222: // C!=0 S==0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, x1, VXC + vfmul.d VX1, x3, VXC + vfmul.d VX2, x2, VXC + vfmul.d VX3, x4, VXC + vstelm.d VX0, XX, 0, 0 + vstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VX0, XX, 0, 1 + vstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.d VX1, YY, 0, 0 + vstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + vstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, x1, VXC + vfmul.d VX1, x3, VXC + vfmul.d VX2, x2, VXC + vfmul.d VX3, x4, VXC + vstelm.d VX0, XX, 0, 0 + vstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VX0, XX, 0, 1 + vstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.d VX1, YY, 0, 0 + vstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + vstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + b .L995 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + vinsgr2vr.w x3, t1, 2 + vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + vfmul.s VX0, x1, VXC + vfmul.s VX1, x3, VXC + vfmul.s VX2, x2, VXC + vfmul.s VX3, x4, VXC + vstelm.w VX0, XX, 0, 0 + vstelm.w VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 1 + vstelm.w VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 2 + vstelm.w VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 3 + vstelm.w VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + vstelm.w VX1, YY, 0, 0 + vstelm.w VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 1 + vstelm.w VX3, YY, 1 * SIZE, 1 + add.d YY, 
YY, INCY + vstelm.w VX1, YY, 0, 2 + vstelm.w VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 3 + vstelm.w VX3, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + b .L997 +#endif + .align 3 + +.L223: // C==0 S!=0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, x3, VXS + vfmul.d VX1, x1, VXS + vfsub.d VX1, VXZ, VX1 + vfmul.d VX2, x4, VXS + vfmul.d VX3, x2, VXS + vfsub.d VX3, VXZ, VX3 + vstelm.d VX0, XX, 0, 0 + vstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VX0, XX, 0, 1 + vstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.d VX1, YY, 0, 0 + vstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + vstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, x3, VXS + vfmul.d VX1, x1, VXS + vfsub.d VX1, VXZ, VX1 + vfmul.d VX2, x4, VXS + vfmul.d VX3, x2, VXS + vfsub.d VX3, VXZ, VX3 + vstelm.d VX0, XX, 0, 0 + vstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VX0, XX, 0, 1 + vstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.d VX1, YY, 0, 0 + vstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + vstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L223 + b .L995 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + vinsgr2vr.w x3, t1, 2 + vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + vfmul.s VX0, x3, VXS + vfmul.s VX1, x1, VXS + vfsub.s VX1, VXZ, VX1 + vfmul.s VX2, x4, VXS + vfmul.s VX3, x2, VXS + vfsub.s VX3, VXZ, VX3 + vstelm.w VX0, XX, 0, 0 + vstelm.w VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 1 + vstelm.w VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 2 + vstelm.w VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 3 + vstelm.w VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + vstelm.w VX1, YY, 0, 0 + vstelm.w VX3, YY, 1 * SIZE, 0 + 
add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 1 + vstelm.w VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 2 + vstelm.w VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 3 + vstelm.w VX3, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L223 + b .L997 +#endif + .align 3 + +.L224: // C==0 S==0 +#ifdef DOUBLE + vstelm.d VXZ, XX, 0, 0 + vstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 0 + vstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 0 + vstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 0 + vstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VXZ, YY, 0, 0 + vstelm.d VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + vstelm.d VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + vstelm.d VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + vstelm.d VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 + move X, XX + move Y, YY + b .L995 +#else + vstelm.w VXZ, XX, 0, 0 + vstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 0 + vstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 0 + vstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 0 + vstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w VXZ, YY, 0, 0 + vstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 0 + vstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 0 + vstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 0 + vstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 + move X, XX + move Y, YY + b .L997 +#endif + .align 3 + +#ifdef DOUBLE + .L995: + andi I, N, 3 + bge $r0, I, .L999 + b .L998 + .align 3 + +#endif +.L996: + move I, N + b .L998 + .align 3 + +.L997: +#ifdef DOUBLE + andi I, N, 1 +#else + andi I, N, 3 +#endif + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + MUL s1, a1, C + MADD s1, a3, S, s1 + MUL s2, a1, S + MSUB s2, a3, C, s2 + MUL s3, a2, C + MADD s3, a4, S, s3 + MUL s4, a2, S + MSUB s4, a4, C, s4 + addi.d I, I, -1 + ST s1, X, 0 * SIZE + ST s2, Y, 0 * SIZE + ST s3, X, 1 * SIZE + ST s4, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/cscal_lasx.S b/kernel/loongarch64/cscal_lasx.S new file mode 100644 index 000000000..f53526663 --- /dev/null +++ b/kernel/loongarch64/cscal_lasx.S @@ -0,0 +1,500 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define ALPHAR $f0 +#define ALPHAI $f1 +#define X $r7 +#define INCX $r8 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define VX0 $xr8 +#define VX1 $xr20 +#define VX2 $xr21 +#define VX3 $xr22 +#define VXAR $xr23 +#define VXAI $xr19 +#define VXZ $xr12 +#define x1 $xr18 +#define x2 $xr17 +#define x3 $xr16 +#define x4 $xr15 + + PROLOGUE + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + MTG t1, ALPHAR +#ifdef DOUBLE + xvreplgr2vr.d VXAR, t1 + movfr2gr.d t2, ALPHAI + xvreplgr2vr.d VXAI, t2 + xvxor.v VXZ, VXZ, VXZ + srai.d I, N, 2 +#else + xvreplgr2vr.w VXAR, t1 + movfr2gr.s t2, ALPHAI + xvreplgr2vr.w VXAI, t2 + xvxor.v VXZ, VXZ, VXZ + srai.d I, N, 3 +#endif + bne INCX, TEMP, .L22 + +.L11: + bge $r0, I, .L997 + CMPEQ $fcc0, ALPHAR, a1 + CMPEQ $fcc1, ALPHAI, a1 + bceqz $fcc0, .L13 + b .L14 + .align 3 + +.L13: + bceqz $fcc1, .L114 //alpha_r != 0.0 && alpha_i != 0.0 + b .L113 //alpha_r != 0.0 && alpha_i == 0.0 + +.L14: + bceqz $fcc1, .L114 //alpha_r == 0.0 && alpha_i != 0.0 + b .L111 //alpha_r == 0.0 && alpha_i == 0.0 + .align 3 + +.L111: //alpha_r == 0.0 && alpha_i == 0.0 + xvst VXZ, X, 0 * SIZE +#ifdef DOUBLE + xvst VXZ, X, 4 * SIZE + addi.d X, X, 8 * SIZE +#else + xvst VXZ, X, 8 * SIZE + addi.d X, X, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L113: //alpha_r != 0.0 && alpha_i == 0.0 + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvfmul.d x3, VXAR, x1 + xvfmul.d x4, VXAR, x2 + xvilvl.d VX2, x4 ,x3 + xvilvh.d VX3, x4, x3 + xvst VX2, X, 0 * SIZE + xvst VX3, X, 4 * SIZE + addi.d X, X, 8 * SIZE +#else + xvld VX1, X, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvfmul.s x3, VXAR, x1 + xvfmul.s x4, VXAR, x2 + xvilvl.w VX2, x4 ,x3 + xvilvh.w VX3, x4, x3 + xvst VX2, X, 0 * SIZE + xvst VX3, X, 8 * SIZE + addi.d X, X, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: //alpha_r != 0.0 && alpha_i != 0.0 + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvfmul.d VX0, VXAI, x2 + xvfmsub.d x3, VXAR, x1, VX0 + xvfmul.d VX1, VXAI, x1 + xvfmadd.d x4, VXAR, x2, VX1 + xvilvl.d VX2, x4 ,x3 + xvilvh.d VX3, x4, x3 + xvst VX2, X, 0 * SIZE + xvst VX3, X, 4 * SIZE + addi.d X, X, 8 * SIZE 
+#else + xvld VX1, X, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvfmul.s VX0, VXAI, x2 + xvfmsub.s x3, VXAR, x1, VX0 + xvfmul.s VX1, VXAI, x1 + xvfmadd.s x4, VXAR, x2, VX1 + xvilvl.w VX2, x4 ,x3 + xvilvh.w VX3, x4, x3 + xvst VX2, X, 0 * SIZE + xvst VX3, X, 8 * SIZE + addi.d X, X, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move XX, X + CMPEQ $fcc0, ALPHAR, a1 + CMPEQ $fcc1, ALPHAI, a1 + bceqz $fcc0, .L23 + b .L24 + .align 3 + +.L23: + bceqz $fcc1, .L224 //alpha_r != 0.0 && alpha_i != 0.0 + b .L223 //alpha_r != 0.0 && alpha_i == 0.0 + +.L24: + bceqz $fcc1, .L224 //alpha_r == 0.0 && alpha_i != 0.0 + b .L221 //alpha_r == 0.0 && alpha_i == 0.0 + .align 3 + +.L221: //alpha_r == 0.0 && alpha_i == 0.0 +#ifdef DOUBLE + xvstelm.d VXZ, X, 0, 0 + xvstelm.d VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.d VXZ, X, 0, 0 + xvstelm.d VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.d VXZ, X, 0, 0 + xvstelm.d VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.d VXZ, X, 0, 0 + xvstelm.d VXZ, X, 1 * SIZE, 0 +#else + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 +#endif + add.d X, X, INCX + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 + .align 3 + +.L223: //alpha_r != 0.0 && alpha_i == 0.0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + + xvfmul.d x3, VXAR, x1 + xvfmul.d x4, VXAR, x2 + addi.d I, I, -1 + xvstelm.d x3, XX, 0 * SIZE, 0 + xvstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 1 + xvstelm.d x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 2 + xvstelm.d x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 3 + xvstelm.d x4, XX, 1 * SIZE, 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, 
t3, 7 + xvinsgr2vr.w x2, t4, 7 + add.d X, X, INCX + + xvfmul.s x3, VXAR, x1 + xvfmul.s x4, VXAR, x2 + addi.d I, I, -1 + xvstelm.w x3, XX, 0 * SIZE, 0 + xvstelm.w x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 1 + xvstelm.w x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 2 + xvstelm.w x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 3 + xvstelm.w x4, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 4 + xvstelm.w x4, XX, 1 * SIZE, 4 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 5 + xvstelm.w x4, XX, 1 * SIZE, 5 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 6 + xvstelm.w x4, XX, 1 * SIZE, 6 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 7 + xvstelm.w x4, XX, 1 * SIZE, 7 +#endif + add.d XX, XX, INCX + blt $r0, I, .L223 + b .L997 + .align 3 + +.L224: //alpha_r != 0.0 && alpha_i != 0.0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + + xvfmul.d VX0, VXAI, x2 + xvfmsub.d x3, VXAR, x1, VX0 + xvfmul.d VX1, VXAI, x1 + xvfmadd.d x4, VXAR, x2, VX1 + addi.d I, I, -1 + xvstelm.d x3, XX, 0 * SIZE, 0 + xvstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 1 + xvstelm.d x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 2 + xvstelm.d x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 3 + xvstelm.d x4, XX, 1 * SIZE, 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + add.d X, X, INCX + + xvfmul.s VX0, VXAI, x2 + xvfmsub.s x3, VXAR, x1, VX0 + xvfmul.s VX1, VXAI, x1 + xvfmadd.s x4, VXAR, x2, VX1 + addi.d I, I, -1 + xvstelm.w x3, XX, 0 * SIZE, 0 + xvstelm.w x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 1 + xvstelm.w x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 2 + xvstelm.w x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 3 + xvstelm.w x4, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 4 + xvstelm.w x4, XX, 1 * SIZE, 4 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 5 + xvstelm.w x4, XX, 1 * SIZE, 5 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 6 + xvstelm.w x4, XX, 1 * SIZE, 6 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 7 + xvstelm.w x4, XX, 1 * SIZE, 7 +#endif + add.d XX, 
XX, INCX + blt $r0, I, .L224 + b .L997 + .align 3 + +.L997: +#ifdef DOUBLE + andi I, N, 3 +#else + andi I, N, 7 +#endif + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + MUL s1, ALPHAI, a2 + MUL s2, ALPHAI, a1 + MSUB s1, ALPHAR, a1, s1 + MADD s2, ALPHAR, a2, s2 + ST s1, X, 0 * SIZE + ST s2, X, 1 * SIZE + add.d X, X, INCX + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/cscal_lsx.S b/kernel/loongarch64/cscal_lsx.S new file mode 100644 index 000000000..241d3d16e --- /dev/null +++ b/kernel/loongarch64/cscal_lsx.S @@ -0,0 +1,445 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define ALPHAR $f0 +#define ALPHAI $f1 +#define X $r7 +#define INCX $r8 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define VX0 $vr8 +#define VX1 $vr20 +#define VX2 $vr21 +#define VX3 $vr22 +#define VXAR $vr23 +#define VXAI $vr19 +#define VXZ $vr12 +#define x1 $vr18 +#define x2 $vr17 +#define x3 $vr16 +#define x4 $vr15 + + PROLOGUE + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + MTG t1, ALPHAR +#ifdef DOUBLE + vreplgr2vr.d VXAR, t1 + movfr2gr.d t2, ALPHAI + vreplgr2vr.d VXAI, t2 +#else + vreplgr2vr.w VXAR, t1 + movfr2gr.s t2, ALPHAI + vreplgr2vr.w VXAI, t2 +#endif + vxor.v VXZ, VXZ, VXZ + srai.d I, N, 2 + bne INCX, TEMP, .L22 + +.L11: + bge $r0, I, .L997 + CMPEQ $fcc0, ALPHAR, a1 + CMPEQ $fcc1, ALPHAI, a1 + bceqz $fcc0, .L13 + b .L14 + .align 3 + +.L13: + bceqz $fcc1, .L114 //alpha_r != 0.0 && alpha_i != 0.0 + b .L113 //alpha_r != 0.0 && alpha_i == 0.0 + +.L14: + bceqz $fcc1, .L114 //alpha_r == 0.0 && alpha_i != 0.0 + b .L111 //alpha_r == 0.0 && alpha_i == 0.0 + .align 3 + +.L111: //alpha_r == 0.0 && alpha_i == 0.0 + vst VXZ, X, 0 * SIZE +#ifdef DOUBLE + vst VXZ, X, 2 * SIZE + vst VXZ, X, 4 * SIZE + vst VXZ, X, 6 * SIZE +#else + vst VXZ, X, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L113: //alpha_r != 0.0 && alpha_i == 0.0 + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VXAR, x1 + vfmul.d x4, VXAR, x2 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, X, 0 * SIZE + vst VX3, X, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VXAR, x1 + vfmul.d x4, VXAR, x2 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, X, 4 * SIZE + vst VX3, X, 6 * SIZE +#else + vld VX1, X, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s x3, VXAR, x1 + vfmul.s x4, VXAR, x2 + vilvl.w VX2, x4 ,x3 + vilvh.w VX3, x4, x3 + vst VX2, X, 0 * SIZE + vst VX3, X, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: //alpha_r != 0.0 && alpha_i != 0.0 + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d VX0, VXAI, x2 + vfmsub.d x3, VXAR, x1, VX0 + vfmul.d VX1, VXAI, x1 + vfmadd.d x4, VXAR, x2, VX1 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, X, 0 * SIZE + vst VX3, X, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d VX0, VXAI, x2 + vfmsub.d x3, VXAR, x1, VX0 + vfmul.d VX1, VXAI, x1 + vfmadd.d x4, VXAR, x2, VX1 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, X, 4 * SIZE + vst VX3, X, 6 * SIZE +#else + vld VX1, X, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s VX0, VXAI, x2 + vfmsub.s x3, VXAR, x1, VX0 + vfmul.s VX1, VXAI, x1 + vfmadd.s x4, VXAR, x2, VX1 + vilvl.w VX2, x4 ,x3 + vilvh.w VX3, x4, x3 + vst VX2, X, 0 * SIZE + vst VX3, X, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L114 + b 
.L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move XX, X + CMPEQ $fcc0, ALPHAR, a1 + CMPEQ $fcc1, ALPHAI, a1 + bceqz $fcc0, .L23 + b .L24 + .align 3 + +.L23: + bceqz $fcc1, .L224 //alpha_r != 0.0 && alpha_i != 0.0 + b .L223 //alpha_r != 0.0 && alpha_i == 0.0 + +.L24: + bceqz $fcc1, .L224 //alpha_r == 0.0 && alpha_i != 0.0 + b .L221 //alpha_r == 0.0 && alpha_i == 0.0 + .align 3 + +.L221: //alpha_r == 0.0 && alpha_i == 0.0 +#ifdef DOUBLE + vstelm.d VXZ, X, 0, 0 + vstelm.d VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + vstelm.d VXZ, X, 0, 0 + vstelm.d VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + vstelm.d VXZ, X, 0, 0 + vstelm.d VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + vstelm.d VXZ, X, 0, 0 + vstelm.d VXZ, X, 1 * SIZE, 0 +#else + vstelm.w VXZ, X, 0, 0 + vstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + vstelm.w VXZ, X, 0, 0 + vstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + vstelm.w VXZ, X, 0, 0 + vstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + vstelm.w VXZ, X, 0, 0 + vstelm.w VXZ, X, 1 * SIZE, 0 +#endif + add.d X, X, INCX + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 + .align 3 + +.L223: //alpha_r != 0.0 && alpha_i == 0.0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + vfmul.d x3, VXAR, x1 + vfmul.d x4, VXAR, x2 + vstelm.d x3, XX, 0 * SIZE, 0 + vstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d x3, XX, 0 * SIZE, 1 + vstelm.d x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + vfmul.d x3, VXAR, x1 + vfmul.d x4, VXAR, x2 + addi.d I, I, -1 + vstelm.d x3, XX, 0 * SIZE, 0 + vstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d x3, XX, 0 * SIZE, 1 + vstelm.d x4, XX, 1 * SIZE, 1 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + + vfmul.s x3, VXAR, x1 + vfmul.s x4, VXAR, x2 + addi.d I, I, -1 + vstelm.w x3, XX, 0 * SIZE, 0 + vstelm.w x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 1 + vstelm.w x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 2 + vstelm.w x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 3 + vstelm.w x4, XX, 1 * SIZE, 3 +#endif + add.d XX, XX, INCX + blt $r0, I, .L223 + b .L997 + .align 3 + +.L224: //alpha_r != 0.0 && alpha_i != 0.0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + vfmul.d VX0, VXAI, x2 + vfmsub.d x3, VXAR, x1, VX0 + vfmul.d VX1, VXAI, x1 + vfmadd.d x4, VXAR, x2, VX1 + vstelm.d x3, XX, 0 * SIZE, 0 + vstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d x3, XX, 0 * SIZE, 1 + vstelm.d x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX 
+ ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + vfmul.d VX0, VXAI, x2 + vfmsub.d x3, VXAR, x1, VX0 + vfmul.d VX1, VXAI, x1 + vfmadd.d x4, VXAR, x2, VX1 + addi.d I, I, -1 + vstelm.d x3, XX, 0 * SIZE, 0 + vstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d x3, XX, 0 * SIZE, 1 + vstelm.d x4, XX, 1 * SIZE, 1 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + + vfmul.s VX0, VXAI, x2 + vfmsub.s x3, VXAR, x1, VX0 + vfmul.s VX1, VXAI, x1 + vfmadd.s x4, VXAR, x2, VX1 + addi.d I, I, -1 + vstelm.w x3, XX, 0 * SIZE, 0 + vstelm.w x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 1 + vstelm.w x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 2 + vstelm.w x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 3 + vstelm.w x4, XX, 1 * SIZE, 3 +#endif + add.d XX, XX, INCX + blt $r0, I, .L224 + b .L997 + .align 3 + +.L997: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + MUL s1, ALPHAI, a2 + MUL s2, ALPHAI, a1 + MSUB s1, ALPHAR, a1, s1 + MADD s2, ALPHAR, a2, s2 + ST s1, X, 0 * SIZE + ST s2, X, 1 * SIZE + add.d X, X, INCX + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/csum_lasx.S b/kernel/loongarch64/csum_lasx.S new file mode 100644 index 000000000..3e65f2c15 --- /dev/null +++ b/kernel/loongarch64/csum_lasx.S @@ -0,0 +1,274 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 +#define res1 $xr16 +#define res2 $xr17 + PROLOGUE + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 + xvld VX2, X, 8 * SIZE + xvld VX3, X, 12 * SIZE + xvfadd.d res2, VX2, VX3 + xvfadd.d res1, res1, res2 +#else + xvld VX0, X, 0 * SIZE + xvld VX1, X, 8 * SIZE + xvfadd.s res2, VX0, VX1 + xvfadd.s res1, res2, res1 +#endif + addi.d X, X, 16 * SIZE + addi.d I, I, -1 + blt $r0, I, .L11 + .align 3 + +.L12: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 +#else + xvfadd.s res2, res1, res2 + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res2, 4 + xvpickve.w VX1, res2, 5 + xvpickve.w VX2, res2, 6 + xvpickve.w VX3, res2, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX2, res1 +#endif + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + ADD a1, a1, a2 + ADD s1, a1, s1 + + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, 
t4, 3 + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX1, t1, 0 + xvinsgr2vr.w VX1, t2, 1 + xvinsgr2vr.w VX1, t3, 2 + xvinsgr2vr.w VX1, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX1, t1, 4 + xvinsgr2vr.w VX1, t2, 5 + xvinsgr2vr.w VX1, t3, 6 + xvinsgr2vr.w VX1, t4, 7 + xvfadd.s res2, VX0, VX1 + xvfadd.s res1, res2, res1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 +#else + xvfadd.s res2, res1, res2 + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res2, 4 + xvpickve.w VX1, res2, 5 + xvpickve.w VX2, res2, 6 + xvpickve.w VX3, res2, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX2, res1 +#endif + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + ADD a1, a1, a2 + ADD s1, a1, s1 + + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.s $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/csum_lsx.S b/kernel/loongarch64/csum_lsx.S new file mode 100644 index 000000000..8de8e27ca --- /dev/null +++ b/kernel/loongarch64/csum_lsx.S @@ -0,0 +1,266 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
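For reference (illustrative, not part of this patch): csum_lasx.S above and csum_lsx.S below implement the complex "sum" kernel, i.e. the plain sum of the real and imaginary parts of every element of the vector, with no absolute value taken, for both unit and non-unit INCX. A minimal scalar C sketch of that semantics, modelling the N/X/INCX arguments seen in $r4..$r6 as (n, x, inc_x); the function name is hypothetical:

double csum_ref(long n, const double *x, long inc_x)
{
    double s = 0.0;
    if (n <= 0 || inc_x <= 0)       /* the two early "bge ... .L999" exits return 0  */
        return s;
    for (long i = 0; i < n; i++) {
        s += x[0] + x[1];           /* real part + imaginary part, no fabs()         */
        x += 2 * inc_x;             /* ZBASE_SHIFT: two scalars per complex element  */
    }
    return s;
}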
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 +#define res1 $vr16 +#define res2 $vr17 +#define res3 $vr18 + PROLOGUE + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + vld VX2, X, 4 * SIZE + vld VX3, X, 6 * SIZE + vfadd.d res2, VX2, VX3 + vfadd.d res1, res1, res2 + vld VX0, X, 8 * SIZE + vld VX1, X, 10 * SIZE + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + vld VX2, X, 12 * SIZE + vld VX3, X, 14 * SIZE + vfadd.d res2, VX2, VX3 + vfadd.d res1, res1, res2 +#else + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + vfadd.s res2, VX0, VX1 + vld VX2, X, 8 * SIZE + vld VX3, X, 12 * SIZE + vfadd.s res3, VX2, VX3 + vfadd.s res2, res3, res2 + vfadd.s res1, res1, res2 +#endif + + addi.d I, I, -1 + addi.d X, X, 16 * SIZE + blt $r0, I, .L11 + .align 3 + +.L12: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 +#endif + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + ADD a1, a1, a2 + ADD s1, a1, s1 + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d VX1, t3, 0 
+ vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vfadd.s res2, VX0, VX1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + vfadd.s res3, VX2, VX3 + vfadd.s res2, res3, res2 + vfadd.s res1, res1, res2 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 +#endif + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + ADD a1, a1, a2 + ADD s1, a1, s1 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.s $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/cswap_lasx.S b/kernel/loongarch64/cswap_lasx.S new file mode 100644 index 000000000..d53773d5a --- /dev/null +++ b/kernel/loongarch64/cswap_lasx.S @@ -0,0 +1,394 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
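The INCX != 1 path of both csum files (.L21) gathers eight complex elements per iteration with scalar ld.d/ld.w loads and [x]vinsgr2vr lane inserts, accumulates one partial sum per lane of res1, folds the lanes in .L22 (xvpickve.d in the LASX file, vreplvei.d in the LSX file), and handles the remaining n & 7 elements in the scalar .L24 loop. A hedged C sketch of that structure, illustrative only and not the kernel's actual interface; the exact element-to-lane mapping is simplified:

double csum_strided_sketch(long n, const double *x, long inc_x)
{
    double lane[4] = {0.0, 0.0, 0.0, 0.0};   /* res1: one partial sum per lane */
    long i = n >> 3;                         /* srai.d I, N, 3                 */
    while (i-- > 0) {                        /* .L21: 8 complex elements/pass  */
        for (int k = 0; k < 8; k++) {
            lane[k % 4] += x[0] + x[1];      /* gathered loads + vector add    */
            x += 2 * inc_x;                  /* add.d X, X, INCX (pre-scaled)  */
        }
    }
    double s = lane[0] + lane[1] + lane[2] + lane[3];   /* .L22 lane fold      */
    for (long k = 0; k < (n & 7); k++) {     /* .L24: scalar tail              */
        s += x[0] + x[1];
        x += 2 * inc_x;
    }
    return s;
}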
+*******************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r17 +#define TEMP $r18 +#define XX $r5 +#define YY $r6 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define b1 $f16 +#define b2 $f17 +#define b3 $f18 +#define b4 $f19 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 + + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX3, Y, 4 * SIZE + xvst VX2, X, 0 * SIZE + xvst VX3, X, 4 * SIZE + xvst VX0, Y, 0 * SIZE + xvst VX1, Y, 4 * SIZE +#else + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + xvst VX2, X, 0 * SIZE + xvst VX0, Y, 0 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 +.L113: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L122 + .align 3 + +.L121: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + xvstelm.d VX0, Y, 0 * SIZE, 0 + ld.d t2, Y, 1 * SIZE + xvstelm.d VX0, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + xvstelm.d VX0, Y, 0 * SIZE, 2 + ld.d t4, Y, 1 * SIZE + xvstelm.d VX0, Y, 1 * SIZE, 3 + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvst VX2, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + xvstelm.d VX1, Y, 0 * SIZE, 0 + ld.d t2, Y, 1 * SIZE + xvstelm.d VX1, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + xvstelm.d VX1, Y, 0 * SIZE, 2 + ld.d t4, Y, 1 * SIZE + xvstelm.d VX1, Y, 1 * SIZE, 3 + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + xvst VX3, X, 4 * SIZE +#else + xvld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + xvstelm.w VX0, Y, 0 * SIZE, 0 + ld.w t2, Y, 1 * SIZE + xvstelm.w VX0, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + xvstelm.w VX0, Y, 0 * SIZE, 2 + ld.w t4, Y, 1 * SIZE + xvstelm.w VX0, Y, 1 * SIZE, 3 + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + ld.w t1, Y, 0 * SIZE + xvstelm.w VX0, Y, 0 * SIZE, 4 + ld.w t2, Y, 1 * SIZE + xvstelm.w VX0, Y, 1 * SIZE, 5 + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + xvstelm.w VX0, Y, 0 * SIZE, 6 + ld.w t4, Y, 1 * SIZE + xvstelm.w VX0, Y, 1 * SIZE, 7 + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvst VX2, X, 0 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L123: + LD 
a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + xvld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + xvstelm.d VX2, X, 0 * SIZE, 0 + ld.d t2, X, 1 * SIZE + xvstelm.d VX2, X, 1 * SIZE, 1 + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + xvstelm.d VX2, X, 0 * SIZE, 2 + ld.d t4, X, 1 * SIZE + xvstelm.d VX2, X, 1 * SIZE, 3 + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + add.d X, X, INCX + xvst VX0, Y, 0 * SIZE + xvld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + xvstelm.d VX3, X, 0 * SIZE, 0 + ld.d t2, X, 1 * SIZE + xvstelm.d VX3, X, 1 * SIZE, 1 + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + xvstelm.d VX3, X, 0 * SIZE, 2 + ld.d t4, X, 1 * SIZE + xvstelm.d VX3, X, 1 * SIZE, 3 + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + add.d X, X, INCX + xvst VX1, Y, 4 * SIZE +#else + xvld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + xvstelm.w VX2, X, 0 * SIZE, 0 + ld.w t2, X, 1 * SIZE + xvstelm.w VX2, X, 1 * SIZE, 1 + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + xvstelm.w VX2, X, 0 * SIZE, 2 + ld.w t4, X, 1 * SIZE + xvstelm.w VX2, X, 1 * SIZE, 3 + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + ld.w t1, X, 0 * SIZE + xvstelm.w VX2, X, 0 * SIZE, 4 + ld.w t2, X, 1 * SIZE + xvstelm.w VX2, X, 1 * SIZE, 5 + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + xvstelm.w VX2, X, 0 * SIZE, 6 + ld.w t4, X, 1 * SIZE + xvstelm.w VX2, X, 1 * SIZE, 7 + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvst VX0, Y, 0 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L213: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + .align 3 + move XX, X + +.L222: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + ST a1, Y, 0 * SIZE + LD b2, Y, 1 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + LD b3, Y, 0 * SIZE + ST a3, Y, 0 * SIZE + LD b4, Y, 1 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * SIZE + ST b1, XX, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + ST b2, XX, 1 * SIZE + add.d XX, XX, INCX + LD a3, X, 0 * SIZE + ST b3, XX, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST b4, XX, 1 * SIZE + add.d XX, XX, INCX + + LD b1, Y, 0 * SIZE + ST a1, Y, 0 * SIZE + LD b2, Y, 1 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + LD b3, Y, 0 * SIZE + ST a3, Y, 0 * SIZE + LD b4, Y, 1 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + ST b1, XX, 0 * SIZE + ST b2, XX, 1 * SIZE + add.d XX, XX, INCX + ST b3, XX, 0 * SIZE + ST b4, XX, 1 * SIZE + + add.d XX, XX, INCX + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L224: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + 
ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/cswap_lsx.S b/kernel/loongarch64/cswap_lsx.S new file mode 100644 index 000000000..62a869066 --- /dev/null +++ b/kernel/loongarch64/cswap_lsx.S @@ -0,0 +1,421 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
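cswap_lasx.S above and cswap_lsx.S below implement the complex swap kernel: element i of X is exchanged with element i of Y, with separate paths for the four combinations of unit and non-unit INCX/INCY and a scalar tail for the last n & 3 elements; when both strides are non-unit (.L222), a second pointer XX into X is kept so values read from Y can still be written back after X has advanced. A minimal scalar C model of the operation, a simplified stand-in rather than the exact BLAS kernel argument list:

void zswap_ref(long n, double *x, long inc_x, double *y, long inc_y)
{
    for (long i = 0; i < n; i++) {
        double re = x[0], im = x[1];   /* LD a1, a2 from X                */
        x[0] = y[0];                   /* ST a3, a4 (read from Y) to X    */
        x[1] = y[1];
        y[0] = re;                     /* ST a1, a2 (saved X) to Y        */
        y[1] = im;
        x += 2 * inc_x;                /* two scalars per complex element */
        y += 2 * inc_y;
    }
}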
+*******************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r17 +#define TEMP $r18 +#define XX $r5 +#define YY $r6 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define b1 $f16 +#define b2 $f17 +#define b3 $f18 +#define b4 $f19 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 + + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vld VX2, Y, 0 * SIZE + vld VX3, Y, 2 * SIZE + vst VX2, X, 0 * SIZE + vst VX3, X, 2 * SIZE + vst VX0, Y, 0 * SIZE + vst VX1, Y, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vld VX2, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + vst VX2, X, 4 * SIZE + vst VX3, X, 6 * SIZE + vst VX0, Y, 4 * SIZE + vst VX1, Y, 6 * SIZE +#else + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + vld VX2, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + vst VX2, X, 0 * SIZE + vst VX3, X, 4 * SIZE + vst VX0, Y, 0 * SIZE + vst VX1, Y, 4 * SIZE +#endif + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L113: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L122 + .align 3 + +.L121: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + vstelm.d VX0, Y, 0 * SIZE, 0 + ld.d t2, Y, 1 * SIZE + vstelm.d VX0, Y, 1 * SIZE, 1 + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vst VX2, X, 0 * SIZE + vld VX1, X, 2 * SIZE + ld.d t3, Y, 0 * SIZE + vstelm.d VX1, Y, 0 * SIZE, 0 + ld.d t4, Y, 1 * SIZE + vstelm.d VX1, Y, 1 * SIZE, 1 + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vst VX3, X, 2 * SIZE + vld VX0, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + vstelm.d VX0, Y, 0 * SIZE, 0 + ld.d t2, Y, 1 * SIZE + vstelm.d VX0, Y, 1 * SIZE, 1 + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vst VX2, X, 4 * SIZE + vld VX1, X, 6 * SIZE + ld.d t3, Y, 0 * SIZE + vstelm.d VX1, Y, 0 * SIZE, 0 + ld.d t4, Y, 1 * SIZE + vstelm.d VX1, Y, 1 * SIZE, 1 + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vst VX3, X, 6 * SIZE +#else + vld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + vstelm.w VX0, Y, 0 * SIZE, 0 + ld.w t2, Y, 1 * SIZE + vstelm.w VX0, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + vstelm.w VX0, Y, 0 * SIZE, 2 + ld.w t4, Y, 1 * SIZE + vstelm.w VX0, Y, 1 * SIZE, 3 + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vst VX2, X, 0 * SIZE + + vld VX1, X, 4 * SIZE + ld.w t1, Y, 0 * SIZE + vstelm.w VX1, Y, 0 * SIZE, 0 + ld.w t2, Y, 1 * SIZE + vstelm.w VX1, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + vstelm.w 
VX1, Y, 0 * SIZE, 2 + ld.w t4, Y, 1 * SIZE + vstelm.w VX1, Y, 1 * SIZE, 3 + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + vst VX3, X, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L123: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + vld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + vstelm.d VX2, X, 0 * SIZE, 0 + ld.d t2, X, 1 * SIZE + vstelm.d VX2, X, 1 * SIZE, 1 + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + vst VX0, Y, 0 * SIZE + vld VX3, Y, 2 * SIZE + ld.d t3, X, 0 * SIZE + vstelm.d VX3, X, 0 * SIZE, 0 + ld.d t4, X, 1 * SIZE + vstelm.d VX3, X, 1 * SIZE, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vst VX1, Y, 2 * SIZE + vld VX2, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + vstelm.d VX2, X, 0 * SIZE, 0 + ld.d t2, X, 1 * SIZE + vstelm.d VX2, X, 1 * SIZE, 1 + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + vst VX0, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + ld.d t3, X, 0 * SIZE + vstelm.d VX3, X, 0 * SIZE, 0 + ld.d t4, X, 1 * SIZE + vstelm.d VX3, X, 1 * SIZE, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vst VX1, Y, 6 * SIZE +#else + vld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + vstelm.w VX2, X, 0 * SIZE, 0 + ld.w t2, X, 1 * SIZE + vstelm.w VX2, X, 1 * SIZE, 1 + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + vstelm.w VX2, X, 0 * SIZE, 2 + ld.w t4, X, 1 * SIZE + vstelm.w VX2, X, 1 * SIZE, 3 + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vst VX0, Y, 0 * SIZE + + vld VX3, Y, 4 * SIZE + ld.w t1, X, 0 * SIZE + vstelm.w VX3, X, 0 * SIZE, 0 + ld.w t2, X, 1 * SIZE + vstelm.w VX3, X, 1 * SIZE, 1 + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + vstelm.w VX3, X, 0 * SIZE, 2 + ld.w t4, X, 1 * SIZE + vstelm.w VX3, X, 1 * SIZE, 3 + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + add.d X, X, INCX + vst VX1, Y, 4 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L213: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + .align 3 + move XX, X + +.L222: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + ST a1, Y, 0 * SIZE + LD b2, Y, 1 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + LD b3, Y, 0 * SIZE + ST a3, Y, 0 * SIZE + LD b4, Y, 1 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * SIZE + ST b1, XX, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + ST b2, XX, 1 * SIZE + add.d XX, XX, INCX + LD a3, X, 0 * SIZE + ST b3, XX, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST b4, XX, 1 * SIZE + add.d XX, XX, INCX + + LD b1, Y, 0 * SIZE + ST a1, Y, 0 * SIZE + LD b2, Y, 1 * SIZE + ST 
a2, Y, 1 * SIZE + add.d Y, Y, INCY + LD b3, Y, 0 * SIZE + ST a3, Y, 0 * SIZE + LD b4, Y, 1 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + ST b1, XX, 0 * SIZE + ST b2, XX, 1 * SIZE + add.d XX, XX, INCX + ST b3, XX, 0 * SIZE + ST b4, XX, 1 * SIZE + add.d XX, XX, INCX + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L224: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + + addi.d I, I, -1 + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/damax_lasx.S b/kernel/loongarch64/damax_lasx.S deleted file mode 100644 index c44ce4995..000000000 --- a/kernel/loongarch64/damax_lasx.S +++ /dev/null @@ -1,183 +0,0 @@ -#define ASSEMBLER - -#include "common.h" - -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r12 -#define J $r13 -#define t1 $r14 -#define t2 $r18 -#define t3 $r15 -#define t4 $r17 -#define TEMP $r16 -#define m0 $xr8 -#define x1 $xr9 -#define x2 $xr10 -#define x3 $xr11 -#define x4 $xr12 -#define x5 $xr13 -#define x6 $xr14 -#define x7 $xr15 -#define x8 $xr16 -#define VX0 $xr20 -#define VX1 $xr21 -#define VM0 $xr22 -#define VM1 $xr23 -#define VM2 $xr18 -#define VM3 $xr19 - - PROLOGUE - - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - bne INCX, TEMP, .L20 - xvld VM0, X, 0 - srai.d I, N, 3 - bge $r0, I, .L12 - .align 3 - -.L10: - xvld VX0, X, 0 * SIZE - xvld VX1, X, 4 * SIZE - addi.d I, I, -1 - xvfmaxa.d VM1, VX1, VX0 - addi.d X, X, 8 * SIZE - xvfmaxa.d VM0, VM0, VM1 - blt $r0, I, .L10 - .align 3 - -.L11: - xvpickve.d x1, VM0, 0 - xvpickve.d x2, VM0, 1 - xvpickve.d x3, VM0, 2 - xvpickve.d x4, VM0, 3 - xvfmaxa.d VM1, x1, x2 - xvfmaxa.d VM2, x3, x4 - xvfmaxa.d VM0, VM1, VM2 - .align 3 - -.L12: //INCX==1 and N<8 - andi I, N, 7 - li.d J, 4 - bge J, I, .L13 // 4> 2)) goto L_N3 */ + srai.d J, N, 2 /* J = bn >> 2 */ + andi N, N, 0x03 + vldrepl.d VALPHA, $sp, 104 /* When N < 4, VALPHA will not changed */ + beq ZERO, J, .L_N3 + +.L_J1: /* J-- && This loop include Condition 1 */ + +/************************* Condition 1 if((N >> 2) && (M >> 3)) START !!! 
************************* +* dgemm_core_16x4 */ + move C0, C + move A0, A + slli.d T0, LDC, 3 + add.d C1, C0, T0 + addi.d J, J, -1 /* J-- */ + add.d C2, C1, T0 + add.d C3, C2, T0 + +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + + /* if (!(M >> 3)) goto L_M8 */ + srai.d I, M, 3 /* I = bm >> 3 */ + beq ZERO, I, .L_M8 + +.L_I1: /* I-- */ +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x06 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 8 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + /* Calculate the first set of D0~D15, + * avoidig set 0 operation + * Load 8 * 64 from A0 + * U0 = {a1, a0} + * U1 = {a3, a2} + * U2 = {a5, a4} + * U3 = {a7, a6} + */ + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vld U2, A0, 0x20 + vld U3, A0, 0x30 + + vldrepl.d U4, B0, 0x00 + preld 0, C0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + vfmul.d D1, U1, U4 + preld 0, C0, 0x20 + vfmul.d D2, U2, U4 + vfmul.d D3, U3, U4 + + vldrepl.d U5, B0, 0x08 + preld 0, C1, 0x00 + /* line 2 */ + vfmul.d D4, U0, U5 + vfmul.d D5, U1, U5 + preld 0, C1, 0x20 + vfmul.d D6, U2, U5 + vfmul.d D7, U3, U5 + + vldrepl.d U6, B0, 0x10 + preld 0, C2, 0x00 + /* line 3 */ + vfmul.d D8, U0, U6 + vfmul.d D9, U1, U6 + preld 0, C2, 0x20 + vfmul.d D10, U2, U6 + vfmul.d D11, U3, U6 + + vldrepl.d U7, B0, 0x18 + preld 0, C3, 0x00 + /* line 4 */ + vfmul.d D12, U0, U7 + vfmul.d D13, U1, U7 + preld 0, C3, 0x20 + vfmul.d D14, U2, U7 + vfmul.d D15, U3, U7 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_L7 */ + beq ZERO,TL, .L_L7 + + vld U8, A0, 0x00 + vld U9, A0, 0x10 + vld U10, A0, 0x20 + vld U11, A0, 0x30 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + vldrepl.d U13, B0, 0x08 + vldrepl.d U14, B0, 0x10 + vldrepl.d U15, B0, 0x18 + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + beq ZERO, TL, .L_TL1_END +.L_TL1: /* TL-- */ + KERNEL8x8x4 + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_TL1 + +.L_TL1_END: + KERNEL8x8x4_END + + /* Maybe we need calculate the last + * 7 sets of D0~D15? 
+ */ +.L_L7: + /* if (!(L & 7)) goto L_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_L0 + +.L_L71: + /* Load 16 * 64 from A0 */ + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vld U2, A0, 0x20 + vld U3, A0, 0x30 + + /* Cumulative D0~D15 */ + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + vfmadd.d D2, U2, U4, D2 + vfmadd.d D3, U3, U4, D3 + + vldrepl.d U5, B0, 0x08 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + vfmadd.d D6, U2, U5, D6 + vfmadd.d D7, U3, U5, D7 + + vldrepl.d U6, B0, 0x10 + vfmadd.d D8, U0, U6, D8 + vfmadd.d D9, U1, U6, D9 + vfmadd.d D10, U2, U6, D10 + vfmadd.d D11, U3, U6, D11 + + vldrepl.d U7, B0, 0x18 + vfmadd.d D12, U0, U7, D12 + vfmadd.d D13, U1, U7, D13 + vfmadd.d D14, U2, U7, D14 + vfmadd.d D15, U3, U7, D15 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_L71 + +.L_L0: + vldrepl.d VALPHA, $sp, 104 +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D1, D1, VALPHA + vfmul.d D2, D2, VALPHA + vfmul.d D3, D3, VALPHA + vfmul.d D4, D4, VALPHA + vfmul.d D5, D5, VALPHA + vfmul.d D6, D6, VALPHA + vfmul.d D7, D7, VALPHA + vfmul.d D8, D8, VALPHA + vfmul.d D9, D9, VALPHA + vfmul.d D10, D10, VALPHA + vfmul.d D11, D11, VALPHA + vfmul.d D12, D12, VALPHA + vfmul.d D13, D13, VALPHA + vfmul.d D14, D14, VALPHA + vfmul.d D15, D15, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vld U1, C0, 0x10 + vld U2, C0, 0x20 + vld U3, C0, 0x30 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + vfmadd.d D1, D1, VALPHA, U1 + vfmadd.d D2, D2, VALPHA, U2 + vfmadd.d D3, D3, VALPHA, U3 + + /* Load C1 */ + vld U4, C1, 0x00 + vld U5, C1, 0x10 + vld U6, C1, 0x20 + vld U7, C1, 0x30 + vfmadd.d D4, D4, VALPHA, U4 + vfmadd.d D5, D5, VALPHA, U5 + vfmadd.d D6, D6, VALPHA, U6 + vfmadd.d D7, D7, VALPHA, U7 + + /* Load C2 */ + vld U8, C2, 0x00 + vld U9, C2, 0x10 + vld U10, C2, 0x20 + vld U11, C2, 0x30 + vfmadd.d D8, D8, VALPHA, U8 + vfmadd.d D9, D9, VALPHA, U9 + vfmadd.d D10, D10, VALPHA, U10 + vfmadd.d D11, D11, VALPHA, U11 + + /* Load C3 */ + vld U0, C3, 0x00 + vld U1, C3, 0x10 + vld U2, C3, 0x20 + vld U3, C3, 0x30 + vfmadd.d D12, D12, VALPHA, U0 + vfmadd.d D13, D13, VALPHA, U1 + vfmadd.d D14, D14, VALPHA, U2 + vfmadd.d D15, D15, VALPHA, U3 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + vst D0, C0, 0x00 + vst D1, C0, 0x10 + vst D2, C0, 0x20 + vst D3, C0, 0x30 + /* Store C1 */ + vst D4, C1, 0x00 + vst D5, C1, 0x10 + vst D6, C1, 0x20 + vst D7, C1, 0x30 + /* Store C2 */ + vst D8, C2, 0x00 + vst D9, C2, 0x10 + vst D10, C2, 0x20 + vst D11, C2, 0x30 + /* Store C3 */ + vst D12, C3, 0x00 + vst D13, C3, 0x10 + vst D14, C3, 0x20 + vst D15, C3, 0x30 + + /* Add stride for C */ + addi.d C0, C0, 0x40 + addi.d C1, C1, 0x40 + addi.d C2, C2, 0x40 + addi.d C3, C3, 0x40 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -8 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x06 + add.d A0, A0, T0 + slli.d T0, L, 0x05 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x08 +#endif +#endif // #if defined(TRMMKERNEL) + + addi.d I, I, -1 /* I-- */ + blt ZERO,I, .L_I1 + +.L_M8: + /* We have done M & 16, considering M=8/4/2/1 */ + andi I, M, 7 + beq ZERO,I, .L_M0 + + andi I, M, 4 + beq ZERO,I, .L_M2 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x05 + add.d 
A0, A0, T0 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 4 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 4 * 64 from A0 */ + vld U0, A0, 0x00 + vld U1, A0, 0x10 + + vldrepl.d U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + vfmul.d D1, U1, U4 + + vldrepl.d U5, B0, 0x08 + /* line 2 */ + vfmul.d D4, U0, U5 + vfmul.d D5, U1, U5 + + vldrepl.d U6, B0, 0x10 + /* line 3 */ + vfmul.d D8, U0, U6 + vfmul.d D9, U1, U6 + + vldrepl.d U7, B0, 0x18 + /* line 4 */ + vfmul.d D12, U0, U7 + vfmul.d D13, U1, U7 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M4_L7 */ + beq ZERO,TL, .L_M4_L7 + + vld U8, A0, 0x00 + vld U9, A0, 0x10 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + vldrepl.d U13, B0, 0x08 + vldrepl.d U14, B0, 0x10 + vldrepl.d U15, B0, 0x18 + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + beq ZERO, TL, .L_M4_TL1_END + +.L_M4_TL1: /* TL-- */ + KERNEL8x4x4 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_M4_TL1 + +.L_M4_TL1_END: + KERNEL8x4x4_END + +.L_M4_L7: + /* if (!(L & 7)) goto L_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M4_L0 + +.L_M4_L71: + vld U0, A0, 0x00 + vld U1, A0, 0x10 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + vldrepl.d U5, B0, 0x08 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + + vldrepl.d U6, B0, 0x10 + vfmadd.d D8, U0, U6, D8 + vfmadd.d D9, U1, U6, D9 + + vldrepl.d U7, B0, 0x18 + vfmadd.d D12, U0, U7, D12 + vfmadd.d D13, U1, U7, D13 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_M4_L71 + +.L_M4_L0: + vldrepl.d VALPHA, $sp, 104 +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D1, D1, VALPHA + vfmul.d D4, D4, VALPHA + vfmul.d D5, D5, VALPHA + vfmul.d D8, D8, VALPHA + vfmul.d D9, D9, VALPHA + vfmul.d D12, D12, VALPHA + vfmul.d D13, D13, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vld U1, C0, 0x10 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + vfmadd.d D1, D1, VALPHA, U1 + + /* Load C1 */ + vld U2, C1, 0x00 + vld U3, C1, 0x10 + vfmadd.d D4, D4, VALPHA, U2 + vfmadd.d D5, D5, VALPHA, U3 + + /* Load C2 */ + vld U4, C2, 0x00 + vld U5, C2, 0x10 + vfmadd.d D8, D8, VALPHA, U4 + vfmadd.d D9, D9, VALPHA, U5 + + /* Load C3 */ + vld U6, C3, 0x00 + vld U7, C3, 0x10 + vfmadd.d D12, D12, VALPHA, U6 + vfmadd.d D13, D13, VALPHA, U7 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + vst D0, C0, 0x00 + vst D1, C0, 0x10 + /* Store C1 */ + vst D4, C1, 0x00 + vst D5, C1, 0x10 + /* Store C2 */ + vst D8, C2, 0x00 + vst D9, C2, 0x10 + /* Store C3 */ + vst D12, C3, 0x00 + vst D13, C3, 0x10 + + /* Add stride for C */ + addi.d C0, C0, 0x20 + addi.d C1, C1, 0x20 + addi.d C2, C2, 0x20 + addi.d C3, C3, 0x20 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -4 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x05 + add.d A0, A0, T0 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N >> 2 ) && (M & 4) ) End************/ 
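This dgemm kernel walks C in 4-column panels (J = N >> 2) and, within each panel, in row tiles of 8, 4, 2 and 1; the M & 4 tile that just ended follows the same pattern as the main 8-row tile, only with half the accumulators. Each tile accumulates packed A times packed B over K entirely in vector registers and touches C once at the end: with TRMMKERNEL the stored value is alpha * acc, otherwise it is C + alpha * acc (vfmadd.d against VALPHA reloaded from the stack). A hedged C sketch of one such tile, leaving out the TRMM offset bookkeeping (OFF/LEFT/TRANSA) and the 8-way K unrolling; MR stands for the tile height, and the names and packed layout are written out only for illustration:

void dgemm_tile_sketch(long K, double alpha,
                       const double *A,  /* packed: MR values per k step */
                       const double *B,  /* packed: 4 values per k step  */
                       double *C, long ldc, int MR)
{
    double acc[4][8] = {{0.0}};                 /* D0..D15, assuming MR <= 8     */
    for (long k = 0; k < K; k++) {
        for (int j = 0; j < 4; j++)             /* vldrepl.d U4..U7 from B       */
            for (int i = 0; i < MR; i++)        /* vld U0..U3 from A             */
                acc[j][i] += A[i] * B[j];       /* vfmadd.d D0, U0, U4, D0 ...   */
        A += MR;
        B += 4;
    }
    for (int j = 0; j < 4; j++)                 /* C0..C3 are columns, LDC apart */
        for (int i = 0; i < MR; i++)
#ifdef TRMMKERNEL
            C[j * ldc + i] = alpha * acc[j][i];   /* vfmul.d + vst               */
#else
            C[j * ldc + i] += alpha * acc[j][i];  /* vfmadd.d with VALPHA + vst  */
#endif
}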
+ +.L_M2: + andi I, M, 2 + beq ZERO,I, .L_M1 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x04 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 2 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 2 * 64 from A0 */ + vldrepl.d U0, A0, 0x00 + vldrepl.d U1, A0, 0x08 + + vld U4, B0, 0x00 + vld U5, B0, 0x10 + + vfmul.d D0, U0, U4 + vfmul.d D1, U0, U5 + vfmul.d D2, U1, U4 + vfmul.d D3, U1, U5 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M2_L7 */ + beq ZERO,TL, .L_M2_L7 + + vldrepl.d U8, A0, 0x00 + vldrepl.d U9, A0, 0x08 + + addi.d TL, TL, -1 + + vld U12, B0, 0x00 + vld U13, B0, 0x10 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + beq ZERO, TL, .L_M2_TL1_END +.L_M2_TL1: /* TL-- */ + KERNEL8x2x4 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M2_TL1 +.L_M2_TL1_END: + KERNEL8x2x4_END + +.L_M2_L7: + /* if (!(L & 7)) goto L_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M2_L0 + +.L_M2_L71: + vldrepl.d U0, A0, 0x00 + vldrepl.d U1, A0, 0x08 + + vld U4, B0, 0x00 + vld U5, B0, 0x10 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U0, U5, D1 + vfmadd.d D2, U1, U4, D2 + vfmadd.d D3, U1, U5, D3 + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_M2_L71 + +.L_M2_L0: + vldrepl.d VALPHA, $sp, 104 +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D1, D1, VALPHA + vfmul.d D2, D2, VALPHA + vfmul.d D3, D3, VALPHA + + vstelm.d D0, C0, 0x00, 0x00 + vstelm.d D0, C1, 0x00, 0x01 + vstelm.d D1, C2, 0x00, 0x00 + vstelm.d D1, C3, 0x00, 0x01 + vstelm.d D2, C0, 0x08, 0x00 + vstelm.d D2, C1, 0x08, 0x01 + vstelm.d D3, C2, 0x08, 0x00 + vstelm.d D3, C3, 0x08, 0x01 +#else + /* Load C0 */ + vld U0, C0, 0x00 + /* Load C1 */ + vld U1, C1, 0x00 + /* Load C2 */ + vld U2, C2, 0x00 + /* Load C3 */ + vld U3, C3, 0x00 + + vilvl.d D4, D2, D0 //C0 + vilvh.d D5, D2, D0 //C1 + vilvl.d D6, D3, D1 //C2 + vilvh.d D7, D3, D1 //C3 + + vfmadd.d D0, D4, VALPHA, U0 + vfmadd.d D2, D5, VALPHA, U1 + vfmadd.d D1, D6, VALPHA, U2 + vfmadd.d D3, D7, VALPHA, U3 + + vst D0, C0, 0x00 + vst D2, C1, 0x00 + vst D1, C2, 0x00 + vst D3, C3, 0x00 +#endif // #if defined(TRMMKERNEL) + + /* Add stride for C */ + addi.d C0, C0, 0x10 + addi.d C1, C1, 0x10 + addi.d C2, C2, 0x10 + addi.d C3, C3, 0x10 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -2 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x04 + add.d A0, A0, T0 + slli.d T0, L, 0x05 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N >> 2 ) && (M & 2) ) End************/ + +.L_M1: + andi I, M, 1 + beq ZERO,I, .L_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x03 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && 
!defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 1 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + vldrepl.d U0, A0, 0x00 + vld U4, B0, 0x00 + vld U5, B0, 0x10 + vfmul.d D0, U0, U4 + vfmul.d D1, U0, U5 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M1_L7 */ + beq ZERO,TL, .L_M1_L7 + + vldrepl.d U8, A0, 0x00 + + addi.d TL, TL, -1 + vld U12, B0, 0x00 + vld U13, B0, 0x10 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + beq ZERO, TL, .L_M1_TL1_END + +.L_M1_TL1: /* TL-- */ + KERNEL8x1x4 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M1_TL1 +.L_M1_TL1_END: + KERNEL8x1x4_END + +.L_M1_L7: + /* if (!(L & 7)) goto L_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M1_L0 + +.L_M1_L71: + vldrepl.d U0, A0, 0x00 + vld U4, B0, 0x00 + vld U5, B0, 0x10 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U0, U5, D1 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_M1_L71 + +.L_M1_L0: + vldrepl.d VALPHA, $sp, 104 +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D1, D1, VALPHA + + vstelm.d D0, C0, 0x00, 0x00 + vstelm.d D0, C1, 0x00, 0x01 + vstelm.d D1, C2, 0x00, 0x00 + vstelm.d D1, C3, 0x00, 0x01 +#else + /* Load C0 */ + vldrepl.d U0, C0, 0x00 + vldrepl.d U1, C1, 0x00 + vilvl.d D4, U1, U0 + vfmadd.d D6, D0, VALPHA, D4 + + vldrepl.d U2, C2, 0x00 + vldrepl.d U3, C3, 0x00 + vilvl.d D5, U3, U2 + vfmadd.d D7, D1, VALPHA, D5 + + vstelm.d D6, C0, 0x00, 0x00 + vstelm.d D6, C1, 0x00, 0x01 + vstelm.d D7, C2, 0x00, 0x00 + vstelm.d D7, C3, 0x00, 0x01 +#endif // #if defined(TRMMKERNEL) + + /* Add stride for C */ + addi.d C0, C0, 0x08 + addi.d C1, C1, 0x08 + addi.d C2, C2, 0x08 + addi.d C3, C3, 0x08 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -1 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x03 + add.d A0, A0, T0 + slli.d T0, L, 0x05 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N >> 2 ) && (M & 1) ) End************/ + +.L_M0: + /* Add stride for B and C + * B += (K * 32) + * C += (LDC * 32) + */ + /* since the array type is double, + * so we must mul 32 + */ + slli.d T0, K, 5 + slli.d T1, LDC, 5 + add.d B, B, T0 + add.d C, C, T1 + +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d OFF, OFF, 0x04 +#endif + + blt ZERO, J, .L_J1 + +//////////////// go back to L_J1 ///////////////// +///////////////////////////////////////////////// +/************************ Condition 1 if((N >> 2) && (M >> 3)) END !!! ************************/ + + vldrepl.d VALPHA, $sp, 104 + +.L_N3: + andi J, N, 2 + beq ZERO, J, .L_N1 + +/************************* Condition 2 if((N & 2) && (M >> 3)) START !!! 
************************* +* dgemm_core_16x2 */ + + move C0, C + move A0, A + slli.d T0, LDC, 3 + add.d C1, C0, T0 + +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + + /* if (!(M >> 3)) goto L_N3_M8 */ + srai.d I, M, 3 /* I = bm >> 3 */ + beq ZERO, I, .L_N3_M8 + +.L_N3_I1: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x06 + add.d A0, A0, T0 + slli.d T0, OFF, 0x04 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 8 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 8 * 64 from A0 + * U0 = {a1, a0} + * U1 = {a3, a2} + * U2 = {a5, a4} + * U3 = {a7, a6} + */ + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vld U2, A0, 0x20 + vld U3, A0, 0x30 + + vldrepl.d U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + vfmul.d D1, U1, U4 + vfmul.d D2, U2, U4 + vfmul.d D3, U3, U4 + + vldrepl.d U5, B0, 0x08 + /* line 2 */ + vfmul.d D4, U0, U5 + vfmul.d D5, U1, U5 + vfmul.d D6, U2, U5 + vfmul.d D7, U3, U5 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_L7 */ + beq ZERO,TL, .L_N3_L7 + + vld U8, A0, 0x00 + vld U9, A0, 0x10 + vld U10, A0, 0x20 + vld U11, A0, 0x30 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + vldrepl.d U13, B0, 0x08 + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + beq ZERO, TL, .L_N3_TL1_END + +.L_N3_TL1: /* TL-- */ + KERNEL8x8x2 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_TL1 +.L_N3_TL1_END: + KERNEL8x8x2_END + +.L_N3_L7: + /* if (!(L & 7)) goto L_N3_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_L0 + +.L_N3_L71: + /* Load 16 * 64 from A0 */ + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vld U2, A0, 0x20 + vld U3, A0, 0x30 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + vfmadd.d D2, U2, U4, D2 + vfmadd.d D3, U3, U4, D3 + + vldrepl.d U5, B0, 0x08 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + vfmadd.d D6, U2, U5, D6 + vfmadd.d D7, U3, U5, D7 + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_L71 + +.L_N3_L0: +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D1, D1, VALPHA + vfmul.d D2, D2, VALPHA + vfmul.d D3, D3, VALPHA + vfmul.d D4, D4, VALPHA + vfmul.d D5, D5, VALPHA + vfmul.d D6, D6, VALPHA + vfmul.d D7, D7, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vld U1, C0, 0x10 + vld U2, C0, 0x20 + vld U3, C0, 0x30 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + vfmadd.d D1, D1, VALPHA, U1 + vfmadd.d D2, D2, VALPHA, U2 + vfmadd.d D3, D3, VALPHA, U3 + + /* Load C1 */ + vld U4, C1, 0x00 + vld U5, C1, 0x10 + vld U6, C1, 0x20 + vld U7, C1, 0x30 + vfmadd.d D4, D4, VALPHA, U4 + vfmadd.d D5, D5, VALPHA, U5 + vfmadd.d D6, D6, VALPHA, U6 + vfmadd.d D7, D7, VALPHA, U7 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + vst D0, C0, 0x00 + vst D1, C0, 0x10 + vst D2, C0, 0x20 + vst D3, C0, 0x30 + /* Store C1 */ + vst D4, C1, 0x00 + vst D5, C1, 0x10 + vst D6, C1, 0x20 + vst D7, C1, 0x30 + + /* Add stride for C */ + addi.d C0, C0, 0x40 + addi.d C1, C1, 0x40 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef 
LEFT + addi.d L, L, -8 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x06 + add.d A0, A0, T0 + slli.d T0, L, 0x04 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x8 +#endif +#endif // #if defined(TRMMKERNEL) + + addi.d I, I, -1 /* I-- */ + blt ZERO,I, .L_N3_I1 + +.L_N3_M8: + /* We have done M & 8, considering M=4/2/1 */ + andi I, M, 7 + beq ZERO,I, .L_N3_M0 + + andi I, M, 4 + beq ZERO,I, .L_N3_M2 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x05 + add.d A0, A0, T0 + slli.d T0, OFF, 0x04 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 4 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 4 * 64 from A0 */ + vld U0, A0, 0x00 + vld U1, A0, 0x10 + + vldrepl.d U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + vfmul.d D1, U1, U4 + + vldrepl.d U5, B0, 0x08 + /* line 2 */ + vfmul.d D4, U0, U5 + vfmul.d D5, U1, U5 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M4_L7 */ + beq ZERO,TL, .L_N3_M4_L7 + + vld U8, A0, 0x00 + vld U9, A0, 0x10 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + vldrepl.d U13, B0, 0x08 + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + beq ZERO, TL, .L_N3_M4_TL1_END + +.L_N3_M4_TL1: /* TL-- */ + KERNEL8x4x2 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_M4_TL1 +.L_N3_M4_TL1_END: + KERNEL8x4x2_END + +.L_N3_M4_L7: + /* if (!(L & 7)) goto L_N3_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_M4_L0 + +.L_N3_M4_L71: + vld U0, A0, 0x00 + vld U1, A0, 0x10 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + vldrepl.d U5, B0, 0x08 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_M4_L71 + +.L_N3_M4_L0: +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D1, D1, VALPHA + vfmul.d D4, D4, VALPHA + vfmul.d D5, D5, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vld U1, C0, 0x10 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + vfmadd.d D1, D1, VALPHA, U1 + + /* Load C1 */ + vld U2, C1, 0x00 + vld U3, C1, 0x10 + vfmadd.d D4, D4, VALPHA, U2 + vfmadd.d D5, D5, VALPHA, U3 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + vst D0, C0, 0x00 + vst D1, C0, 0x10 + /* Store C1 */ + vst D4, C1, 0x00 + vst D5, C1, 0x10 + + /* Add stride for C */ + addi.d C0, C0, 0x20 + addi.d C1, C1, 0x20 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -4 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x05 + add.d A0, A0, T0 + slli.d T0, L, 0x04 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 2 ) && (M & 4) ) End************/ + +.L_N3_M2: + andi I, M, 2 + beq ZERO,I, .L_N3_M1 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x04 + add.d A0, A0, T0 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d 
L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 2 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 2 * 64 from A0 */ + vld U0, A0, 0x00 + + vldrepl.d U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + + vldrepl.d U4, B0, 0x08 + /* line 2 */ + vfmul.d D4, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M2_L7 */ + beq ZERO,TL, .L_N3_M2_L7 + + vld U8, A0, 0x00 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + vldrepl.d U13, B0, 0x08 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + beq ZERO, TL, .L_N3_M2_TL1_END + +.L_N3_M2_TL1: /* TL-- */ + KERNEL8x2x2 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_M2_TL1 +.L_N3_M2_TL1_END: + KERNEL8x2x2_END + +.L_N3_M2_L7: + /* if (!(L & 7)) goto L_N3_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_M2_L0 + +.L_N3_M2_L71: + vld U0, A0, 0x00 + + vldrepl.d U4, B0, 0x00 + vldrepl.d U5, B0, 0x08 + vfmadd.d D0, U0, U4, D0 + + vfmadd.d D4, U0, U5, D4 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_M2_L71 + +.L_N3_M2_L0: +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D4, D4, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + + /* Load C1 */ + vld U1, C1, 0x00 + vfmadd.d D4, D4, VALPHA, U1 +#endif // #if defined(TRMMKERNEL) + + vst D0, C0, 0x00 + vst D4, C1, 0x00 + + /* Add stride for C */ + addi.d C0, C0, 0x10 + addi.d C1, C1, 0x10 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -2 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x04 + add.d A0, A0, T0 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 2 ) && (M & 2) ) End************/ + +.L_N3_M1: + andi I, M, 1 + beq ZERO,I, .L_N3_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x03 + add.d A0, A0, T0 + slli.d T0, OFF, 0x04 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 1 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 1 * 64 from A0 */ + vldrepl.d U0, A0, 0x00 + + vld U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M1_L7 */ + beq ZERO,TL, .L_N3_M1_L7 + + vldrepl.d U8, A0, 0x00 + + addi.d TL, TL, -1 + + vld U12, B0, 0x00 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + beq ZERO, TL, .L_N3_M1_TL1_END + +.L_N3_M1_TL1: /* TL-- */ + KERNEL8x1x2 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_M1_TL1 +.L_N3_M1_TL1_END: + KERNEL8x1x2_END + +.L_N3_M1_L7: + /* if (!(L & 7)) goto L_N3_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_M1_L0 + +.L_N3_M1_L71: + vldrepl.d U0, A0, 0x00 + + vld U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x08 + addi.d 
B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_M1_L71 + +.L_N3_M1_L0: +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vld U1, C1, 0x00 + vilvl.d U2, U1, U0 + vfmadd.d D0, D0, VALPHA, U2 +#endif // #if defined(TRMMKERNEL) + + vstelm.d D0, C0, 0x00, 0x00 + vstelm.d D0, C1, 0x00, 0x01 + + /* Add stride for C */ + addi.d C0, C0, 0x08 + addi.d C1, C1, 0x08 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -1 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x03 + add.d A0, A0, T0 + slli.d T0, L, 0x04 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 2 ) && (M & 1) ) End************/ + +.L_N3_M0: + /* Add stride for B and C + * B += (K * 16) + * C += (LDC * 16) + */ + /* since the array type is double, + * so we must mul 16 + */ + slli.d T0, K, 4 + slli.d T1, LDC, 4 + add.d B, B, T0 + add.d C, C, T1 + +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d OFF, OFF, 0x02 +#endif + + /* We must reinit I */ + srai.d I, M, 4 /* I = bm >> 4 */ + +/************************* Condition 2 if((N & 2) && (M >> 3)) End !!! ************************* +* dgemm_core_16x2 */ + +.L_N1: + andi J, N, 1 + beq ZERO, J, .L_N0 + +/************************* Condition 3 if((N & 1) && (M >> 3)) START !!! ************************* +* dgemm_core_16x1 */ + + move C0, C + move A0, A + +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + + /* if (!(M >> 3)) goto L_N1_M8 */ + srai.d I, M, 3 /* I = bm >> 3 */ + beq ZERO, I, .L_N1_M8 + +.L_N1_I1: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x06 + add.d A0, A0, T0 + slli.d T0, OFF, 0x03 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 8 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 8 * 64 from A0 + * U0 = {a3, a2} + * U1 = {a1, a0} + * U2 = {a5, a4} + * U3 = {a7, a6} + */ + + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vld U2, A0, 0x20 + vld U3, A0, 0x30 + + vldrepl.d U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + vfmul.d D1, U1, U4 + vfmul.d D2, U2, U4 + vfmul.d D3, U3, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_L7 */ + beq ZERO,TL, .L_N1_L7 + + vld U8, A0, 0x00 + vld U9, A0, 0x10 + vld U10, A0, 0x20 + vld U11, A0, 0x30 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + beq ZERO, TL, .L_N1_TL1_END +.L_N1_TL1: /* TL-- */ + KERNEL8x8x1 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_TL1 +.L_N1_TL1_END: + KERNEL8x8x1_END + +.L_N1_L7: + /* if (!(L & 7)) goto L_N1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_L0 + +.L_N1_L71: + /* Load 16 * 64 from A0 */ + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vld U2, A0, 0x20 + vld U3, A0, 0x30 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + vfmadd.d D2, U2, U4, D2 + vfmadd.d D3, U3, U4, D3 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_L71 + +.L_N1_L0: +#if 
defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D1, D1, VALPHA + vfmul.d D2, D2, VALPHA + vfmul.d D3, D3, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vld U1, C0, 0x10 + vld U2, C0, 0x20 + vld U3, C0, 0x30 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + vfmadd.d D1, D1, VALPHA, U1 + vfmadd.d D2, D2, VALPHA, U2 + vfmadd.d D3, D3, VALPHA, U3 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + vst D0, C0, 0x00 + vst D1, C0, 0x10 + vst D2, C0, 0x20 + vst D3, C0, 0x30 + + /* Add stride for C */ + addi.d C0, C0, 0x40 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -8 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x06 + add.d A0, A0, T0 + slli.d T0, L, 0x03 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x8 +#endif +#endif // #if defined(TRMMKERNEL) + + addi.d I, I, -1 /* I-- */ + blt ZERO,I, .L_N1_I1 + +.L_N1_M8: + /* We have done M & 16, considering M=8/4/2/1 */ + andi I, M, 7 + beq ZERO,I, .L_N1_M0 + + andi I, M, 4 + beq ZERO,I, .L_N1_M2 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x05 + add.d A0, A0, T0 + slli.d T0, OFF, 0x03 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 4 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 4 * 64 from A0 */ + vld U0, A0, 0x00 + vld U1, A0, 0x10 + + vldrepl.d U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + vfmul.d D1, U1, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M4_L7 */ + beq ZERO,TL, .L_N1_M4_L7 + + vld U8, A0, 0x00 + vld U9, A0, 0x10 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + beq ZERO, TL, .L_N1_M4_TL1_END + +.L_N1_M4_TL1: /* TL-- */ + KERNEL8x4x1 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M4_TL1 +.L_N1_M4_TL1_END: + KERNEL8x4x1_END + +.L_N1_M4_L7: + /* if (!(L & 7)) goto L_N1_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M4_L0 + +.L_N1_M4_L71: + vld U0, A0, 0x00 + vld U1, A0, 0x10 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_M4_L71 + +.L_N1_M4_L0: +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D1, D1, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vld U1, C0, 0x10 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + vfmadd.d D1, D1, VALPHA, U1 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + vst D0, C0, 0x00 + vst D1, C0, 0x10 + + /* Add stride for C */ + addi.d C0, C0, 0x20 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -4 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x05 + add.d A0, A0, T0 + slli.d T0, L, 0x03 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 1) && (M & 4) ) End************/ + +.L_N1_M2: + andi I, M, 2 + beq ZERO,I, .L_N1_M1 + +#if defined(TRMMKERNEL) +#if 
(defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x04 + add.d A0, A0, T0 + slli.d T0, OFF, 0x03 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 2 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 2 * 64 from A0 */ + vld U0, A0, 0x00 + + vldrepl.d U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M2_L7 */ + beq ZERO,TL, .L_N1_M2_L7 + + vld U8, A0, 0x00 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + beq ZERO, TL, .L_N1_M2_TL1_END + +.L_N1_M2_TL1: /* TL-- */ + KERNEL8x2x1 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M2_TL1 +.L_N1_M2_TL1_END: + KERNEL8x2x1_END + +.L_N1_M2_L7: + /* if (!(L & 7)) goto L_N1_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M2_L0 + +.L_N1_M2_L71: + vld U0, A0, 0x00 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_M2_L71 + +.L_N1_M2_L0: +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ +#endif // #if defined(TRMMKERNEL) + + vstelm.d D0, C0, 0x00, 0x00 + vstelm.d D0, C0, 0x08, 0x01 + + /* Add stride for C */ + addi.d C0, C0, 0x10 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -2 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x04 + add.d A0, A0, T0 + slli.d T0, L, 0x03 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 1 ) && (M & 2) ) End************/ + +.L_N1_M1: + andi I, M, 1 + beq ZERO,I, .L_N1_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x03 + add.d A0, A0, T0 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 1 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 1 * 64 from A0 */ + vldrepl.d U0, A0, 0x00 + + vldrepl.d U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M1_L7 */ + beq ZERO,TL, .L_N1_M1_L7 + + vldrepl.d U8, A0, 0x00 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + beq ZERO, TL, .L_N1_M1_TL1_END + +.L_N1_M1_TL1: /* TL-- */ + KERNEL8x1x1 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M1_TL1 +.L_N1_M1_TL1_END: + KERNEL8x1x1_END + +.L_N1_M1_L7: + /* if (!(L & 7)) goto L_N1_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M1_L0 + +.L_N1_M1_L71: + vldrepl.d U0, A0, 0x00 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + + /* 
Add stride for A0, B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_M1_L71 + +.L_N1_M1_L0: +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA +#else + /* Load C0 */ + vldrepl.d U0, C0, 0x00 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ +#endif // #if defined(TRMMKERNEL) + + vstelm.d D0, C0, 0x00, 0x00 + + /* Add stride for C */ + addi.d C0, C0, 0x08 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -1 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x03 + add.d A0, A0, T0 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 1 ) && (M & 1) ) End************/ + +.L_N1_M0: + +/************************* Condition 3 if((N & 1) && (M >> 3)) End !!! ************************* +* dgemm_core_16x1 */ + +.L_N0: + /* Restore regs */ + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LD $f24, $sp, 40 + LD $f25, $sp, 48 + LD $f26, $sp, 56 + LD $f27, $sp, 64 + LD $f28, $sp, 72 + LD $f29, $sp, 80 + LD $f30, $sp, 88 + LD $f31, $sp, 96 + addi.d $sp, $sp, 112 + + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/dgemm_ncopy_4_lsx.S b/kernel/loongarch64/dgemm_ncopy_4_lsx.S new file mode 100644 index 000000000..048a49af6 --- /dev/null +++ b/kernel/loongarch64/dgemm_ncopy_4_lsx.S @@ -0,0 +1,185 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
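
Editor's note: the edge-case GEMM loops above all share one K-loop shape: peel the first iteration, run a main loop over (K-1)>>3 blocks of eight fused multiply-adds, finish the (K-1)&7 remainder, then fold the accumulator into C with ALPHA (the TRMM path scales by ALPHA without reading C). A scalar sketch of the smallest tile (the N&1, M&1 case) follows; the function name and signature are illustrative, not taken from the patch, and K >= 1 is assumed as in the assembly.

    #include <stddef.h>

    /* Scalar model of the 1x1 edge tile: one packed column of A times one
     * packed row of B.  The K loop mirrors the assembly's split: peel the
     * first iteration, run (K-1)>>3 blocks of eight, then the (K-1)&7
     * remainder.  Non-TRMM path shown: C += alpha * (A*B). */
    static void dgemm_tile_1x1_ref(ptrdiff_t K, const double *a, const double *b,
                                   double alpha, double *c)
    {
        double d0 = a[0] * b[0];               /* peeled first iteration  */
        ptrdiff_t k = 1;
        for (; k + 8 <= K; k += 8)             /* main loop, 8 updates    */
            for (ptrdiff_t u = 0; u < 8; u++)
                d0 += a[k + u] * b[k + u];
        for (; k < K; k++)                     /* (K-1) & 7 remainder     */
            d0 += a[k] * b[k];
        c[0] += alpha * d0;                    /* merge into C with alpha */
    }
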
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define TD $r20 +#define TS $r21 +#define TL $r7 +#define T0 $r6 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 +/* LSX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define D0 $vr8 +#define D1 $vr9 +#define D2 $vr10 +#define D3 $vr11 +#define D4 $vr12 +#define D5 $vr13 +#define D6 $vr14 +#define D7 $vr15 + + PROLOGUE + + move TD, DST + move TS, SRC + slli.d TL, LDA, 0x03 + slli.d T0, TL, 0x01 + srai.d J, N, 0x02 + beq J, ZERO, .L_N2 +.L_J1: /* J-- */ + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x02 + add.d S3, S2, TL + add.d S4, S2, T0 + add.d TS, S3, T0 + addi.d J, J, -1 + beq I, ZERO, .L_I3 +.L_I1: /* I-- */ + GLD v, , U0, S1, 0x00, U1, S2, 0x00, U2, S3, 0x00, U3, S4, 0x00 + GINTERLACE v, d, D0, D2, U1, U0 + GINTERLACE v, d, D1, D3, U3, U2 + GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30 + addi.d TD, TD, 0x40 + + GLD v, , U0, S1, 0x10, U1, S2, 0x10, U2, S3, 0x10, U3, S4, 0x10 + GINTERLACE v, d, D0, D2, U1, U0 + GINTERLACE v, d, D1, D3, U3, U2 + GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d TD, TD, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_I1 +.L_I3: + andi I, M, 0x03 + beq I, ZERO, .L_I0 +.L_II1: + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + + fst.d F0, TD, 0x00 + addi.d S1, S1, 0x08 + fst.d F1, TD, 0x08 + addi.d S2, S2, 0x08 + fst.d F2, TD, 0x10 + addi.d S3, S3, 0x08 + fst.d F3, TD, 0x18 + addi.d S4, S4, 0x08 + + addi.d TD, TD, 0x20 + addi.d I, I, -1 + blt ZERO, I, .L_II1 +.L_I0: + blt ZERO, J, .L_J1 +.L_N2: + andi J, N, 0x02 + beq ZERO, J, .L_N1 + + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x01 + add.d TS, S2, TL + beq I, ZERO, .L_2I3 +.L_2I1: /* I-- */ + GLD v, , U0, S1, 0x00, U1, S2, 0x00 + GINTERLACE v, d, D0, D1, U1, U0 + GST v, , D0, TD, 0x00, D1, TD, 0x10 + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d TD, TD, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_2I1 +.L_2I3: + andi I, M, 0x01 + beq ZERO, I, .L_N1 +.L_2II1: /* I-- */ + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fst.d F0, TD, 0x00 + addi.d I, I, -1 + fst.d F1, TD, 0x08 + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d TD, TD, 0x10 + blt ZERO, I, .L_2II1 +.L_N1: + move S1, TS + beq ZERO, M, .L_N0 +.L_M1: + fld.d F0, S1, 0x00 + addi.d S1, S1, 0x08 + fst.d F0, TD, 0x00 + addi.d TD, TD, 0x08 + addi.d M, M, -1 + blt ZERO, M, .L_M1 +.L_N0: + jirl $r0, $r1, 0x00 + EPILOGUE diff --git a/kernel/loongarch64/dgemm_ncopy_8_lsx.S b/kernel/loongarch64/dgemm_ncopy_8_lsx.S new file mode 100644 index 000000000..30bebe8df --- /dev/null +++ b/kernel/loongarch64/dgemm_ncopy_8_lsx.S @@ -0,0 +1,283 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. 
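
Editor's note: the dgemm_ncopy_4_lsx.S routine just above (and the 8-column variant that follows) packs panels of a column-major operand so that, within each panel, the entries of one row across the panel's columns become contiguous. A rough plain-C sketch of the 4-column case; the name and signature are illustrative, not code from the patch.

    #include <stddef.h>

    /* Rough model of the 4-column pack: a is column-major (element (i,j) at
     * a[j*lda + i]); within each 4-column panel the packed buffer holds the
     * four entries of row 0, then row 1, and so on, followed by the n&2 and
     * n&1 tails in the same row-wise order. */
    static void dgemm_ncopy_4_ref(ptrdiff_t m, ptrdiff_t n,
                                  const double *a, ptrdiff_t lda, double *b)
    {
        ptrdiff_t i, j;
        for (j = 0; j + 4 <= n; j += 4) {
            const double *a0 = a + (j + 0) * lda, *a1 = a + (j + 1) * lda;
            const double *a2 = a + (j + 2) * lda, *a3 = a + (j + 3) * lda;
            for (i = 0; i < m; i++) {          /* one row of the panel */
                *b++ = a0[i]; *b++ = a1[i];
                *b++ = a2[i]; *b++ = a3[i];
            }
        }
        if (n & 2) {                           /* two leftover columns */
            const double *a0 = a + j * lda, *a1 = a + (j + 1) * lda;
            for (i = 0; i < m; i++) { *b++ = a0[i]; *b++ = a1[i]; }
            j += 2;
        }
        if (n & 1) {                           /* last single column   */
            const double *a0 = a + j * lda;
            for (i = 0; i < m; i++) *b++ = a0[i];
        }
    }
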
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define TD $r20 +#define TS $r21 +#define TL $r7 +#define T0 $r6 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 +/* LSX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define D0 $vr8 +#define D1 $vr9 +#define D2 $vr10 +#define D3 $vr11 +#define D4 $vr12 +#define D5 $vr13 +#define D6 $vr14 +#define D7 $vr15 + + PROLOGUE + push_if_used 26, 32 + move TD, DST + move TS, SRC + slli.d TL, LDA, 0x03 + slli.d T0, TL, 0x01 + srai.d J, N, 0x03 + beq J, ZERO, .L_N4 +.L_J1: + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x03 + add.d S3, S2, TL + addi.d J, J, -1 + add.d S4, S3, TL + add.d S5, S3, T0 + add.d S6, S4, T0 + add.d S7, S5, T0 + add.d S8, S6, T0 + add.d TS, S7, T0 + beq I, ZERO, .L_I7 +.L_I1: + GLD v, , U0, S1, 0x00, U1, S2, 0x00, U2, S3, 0x00, U3, S4, 0x00, \ + U4, S5, 0x00, U5, S6, 0x00, U6, S7, 0x00, U7, S8, 0x00 + GINTERLACE v, d, D0, D4, U1, U0 + GINTERLACE v, d, D1, D5, U3, U2 + GINTERLACE v, d, D2, D6, U5, U4 + GINTERLACE v, d, D3, D7, U7, U6 + GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30, \ + D4, TD, 0x40, D5, TD, 0x50, D6, TD, 0x60, D7, TD, 0x70 + addi.d TD, TD, 0x80 + GLD v, , U0, S1, 0x10, U1, S2, 0x10, U2, S3, 0x10, U3, S4, 0x10, \ + U4, S5, 0x10, U5, S6, 0x10, U6, S7, 0x10, U7, S8, 0x10 + GINTERLACE v, d, D0, D4, U1, U0 + GINTERLACE v, d, D1, D5, U3, U2 + GINTERLACE v, d, D2, D6, U5, U4 + GINTERLACE v, d, D3, D7, U7, U6 + GST v, , D0, TD, 0x00, 
D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30, \ + D4, TD, 0x40, D5, TD, 0x50, D6, TD, 0x60, D7, TD, 0x70 + addi.d TD, TD, 0x80 + GLD v, , U0, S1, 0x20, U1, S2, 0x20, U2, S3, 0x20, U3, S4, 0x20, \ + U4, S5, 0x20, U5, S6, 0x20, U6, S7, 0x20, U7, S8, 0x20 + GINTERLACE v, d, D0, D4, U1, U0 + GINTERLACE v, d, D1, D5, U3, U2 + GINTERLACE v, d, D2, D6, U5, U4 + GINTERLACE v, d, D3, D7, U7, U6 + GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30, \ + D4, TD, 0x40, D5, TD, 0x50, D6, TD, 0x60, D7, TD, 0x70 + addi.d TD, TD, 0x80 + GLD v, , U0, S1, 0x30, U1, S2, 0x30, U2, S3, 0x30, U3, S4, 0x30, \ + U4, S5, 0x30, U5, S6, 0x30, U6, S7, 0x30, U7, S8, 0x30 + GINTERLACE v, d, D0, D4, U1, U0 + GINTERLACE v, d, D1, D5, U3, U2 + GINTERLACE v, d, D2, D6, U5, U4 + GINTERLACE v, d, D3, D7, U7, U6 + GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30, \ + D4, TD, 0x40, D5, TD, 0x50, D6, TD, 0x60, D7, TD, 0x70 + addi.d TD, TD, 0x80 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + addi.d S5, S5, 0x40 + addi.d S6, S6, 0x40 + addi.d S7, S7, 0x40 + addi.d S8, S8, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_I1 +.L_I7: + andi I, M, 0x07 + beq I, ZERO, .L_I0 +.L_II1: /* I-- */ + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + fld.d F4, S5, 0x00 + fld.d F5, S6, 0x00 + fld.d F6, S7, 0x00 + fld.d F7, S8, 0x00 + + fst.d F0, TD, 0x00 + addi.d S1, S1, 0x08 + fst.d F1, TD, 0x08 + addi.d S2, S2, 0x08 + fst.d F2, TD, 0x10 + addi.d S3, S3, 0x08 + fst.d F3, TD, 0x18 + addi.d S4, S4, 0x08 + fst.d F4, TD, 0x20 + addi.d S5, S5, 0x08 + fst.d F5, TD, 0x28 + addi.d S6, S6, 0x08 + fst.d F6, TD, 0x30 + addi.d S7, S7, 0x08 + fst.d F7, TD, 0x38 + addi.d S8, S8, 0x08 + addi.d TD, TD, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_II1 +.L_I0: + blt ZERO, J, .L_J1 +.L_N4: + andi J, N, 0x04 + beq ZERO, J, .L_N2 + + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x02 + add.d S3, S2, TL + add.d S4, S2, T0 + add.d TS, S3, T0 + beq I, ZERO, .L_I3 +.L_4I1: /* I-- */ + GLD v, , U0, S1, 0x00, U1, S2, 0x00, U2, S3, 0x00, U3, S4, 0x00 + GINTERLACE v, d, D0, D2, U1, U0 + GINTERLACE v, d, D1, D3, U3, U2 + GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30 + addi.d TD, TD, 0x40 + + GLD v, , U0, S1, 0x10, U1, S2, 0x10, U2, S3, 0x10, U3, S4, 0x10 + GINTERLACE v, d, D0, D2, U1, U0 + GINTERLACE v, d, D1, D3, U3, U2 + GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d TD, TD, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_4I1 +.L_I3: + andi I, M, 0x03 + beq I, ZERO, .L_N2 +.L_4II1: + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + + fst.d F0, TD, 0x00 + addi.d S1, S1, 0x08 + fst.d F1, TD, 0x08 + addi.d S2, S2, 0x08 + fst.d F2, TD, 0x10 + addi.d S3, S3, 0x08 + fst.d F3, TD, 0x18 + addi.d S4, S4, 0x08 + + addi.d TD, TD, 0x20 + addi.d I, I, -1 + blt ZERO, I, .L_4II1 +.L_N2: + andi J, N, 0x02 + beq ZERO, J, .L_N1 + + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x01 + add.d TS, S2, TL + beq I, ZERO, .L_NI1 +.L_2I1: /* I-- */ + GLD v, , U0, S1, 0x00, U1, S2, 0x00 + GINTERLACE v, d, D0, D1, U1, U0 + GST v, , D0, TD, 0x00, D1, TD, 0x10 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d TD, TD, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_2I1 +.L_NI1: + andi I, M, 0x01 + beq I, ZERO, .L_N1 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + + fst.d F0, TD, 0x00 + addi.d S1, S1, 0x08 + fst.d F1, TD, 0x08 + addi.d S2, S2, 0x08 
+ addi.d TD, TD, 0x10 +.L_N1: + move S1, TS + beq ZERO, M, .L_N0 +.L_M1: + fld.d F0, S1, 0x00 + addi.d S1, S1, 0x08 + fst.d F0, TD, 0x00 + addi.d TD, TD, 0x08 + addi.d M, M, -1 + blt ZERO, M, .L_M1 +.L_N0: + pop_if_used 26, 32 + jirl $r0, $r1, 0x00 + EPILOGUE diff --git a/kernel/loongarch64/dgemm_tcopy_4_lsx.S b/kernel/loongarch64/dgemm_tcopy_4_lsx.S new file mode 100644 index 000000000..134066471 --- /dev/null +++ b/kernel/loongarch64/dgemm_tcopy_4_lsx.S @@ -0,0 +1,280 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
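
Editor's note: the dgemm_tcopy_4_lsx.S routine that follows splits its destination into three regions (full 4-column panels, the n&2 tail, the n&1 tail) before copying 4x4 blocks, matching its P0/P2/P3 pointer setup. A rough plain-C sketch of the addressing it appears to implement; the name and signature are illustrative.

    #include <stddef.h>

    /* Rough model of the 4x4 tcopy addressing: element (i,j) is read from
     * a[i*lda + j]; full 4-column panels go to b, the n&2 tail to
     * b + m*(n & ~3) and the n&1 tail to b + m*(n & ~1). */
    static void dgemm_tcopy_4_ref(ptrdiff_t m, ptrdiff_t n,
                                  const double *a, ptrdiff_t lda, double *b)
    {
        double *b1 = b;                        /* 4-column panels      */
        double *b2 = b + m * (n & ~(ptrdiff_t)3);
        double *b3 = b + m * (n & ~(ptrdiff_t)1);
        for (ptrdiff_t i = 0; i < m; i++) {
            const double *ai = a + i * lda;
            ptrdiff_t j = 0;
            for (; j + 4 <= n; j += 4) {       /* panel j/4, row i     */
                double *dst = b1 + (j / 4) * (4 * m) + 4 * i;
                dst[0] = ai[j];     dst[1] = ai[j + 1];
                dst[2] = ai[j + 2]; dst[3] = ai[j + 3];
            }
            if (n & 2) { b2[2 * i] = ai[j]; b2[2 * i + 1] = ai[j + 1]; j += 2; }
            if (n & 1) { b3[i] = ai[j]; }
        }
    }
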
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S0 $r11 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define P0 $r16 +#define P1 $r17 +#define P2 $r18 +#define P3 $r19 +#define T0 $r20 +#define T1 $r23 +#define TL $r7 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +/* LSX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 + + PROLOGUE + push_if_used 18, 8 + + move S0, SRC + move P0, DST + + // Find P0, P2, P3 + srai.d T0, N, 0x02 + slli.d T0, T0, 0x02 + srai.d T1, N, 0x01 + slli.d T1, T1, 0x01 + mul.d T0, M, T0 + mul.d T1, M, T1 + slli.d T0, T0, 0x03 + slli.d T1, T1, 0x03 + add.d P2, DST, T0 + add.d P3, DST, T1 + + slli.d TL, LDA, 0x03 + srai.d J, M, 0x02 + slli.d T0, TL, 0x01 + slli.d T1, M, 0x05 + beq ZERO, J, .L_M3 +.L_J1: /* J-- */ + move S1, S0 + add.d S2, S0, TL + add.d S3, S1, T0 + add.d S4, S2, T0 + add.d S0, S3, T0 + + move P1, P0 + addi.d P0, P0, 0x80 + + srai.d I, N, 0x02 + addi.d J, J, -1 + beq ZERO, I, .L_N3 +.L_I1: /* I-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S2, 0x00 + vld U3, S2, 0x10 + vld U4, S3, 0x00 + vld U5, S3, 0x10 + vld U6, S4, 0x00 + vld U7, S4, 0x10 + + vst U0, P1, 0x00 + vst U1, P1, 0x10 + vst U2, P1, 0x20 + vst U3, P1, 0x30 + vst U4, P1, 0x40 + vst U5, P1, 0x50 + vst U6, P1, 0x60 + vst U7, P1, 0x70 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + add.d P1, P1, T1 + + addi.d I, I, -1 + blt ZERO, I, .L_I1 +.L_N3: + andi I, N, 0x02 + beq ZERO, I, .L_N1 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + vld U2, S3, 0x00 + vld U3, S4, 0x00 + + vst U0, P2, 0x00 + vst U1, P2, 0x10 + vst U2, P2, 0x20 + vst U3, P2, 0x30 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d P2, P2, 0x40 +.L_N1: + andi I, N, 0x01 + beq ZERO, I, .L_N0 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + + fst.d F0, P3, 0x00 + fst.d F1, P3, 0x08 + fst.d F2, P3, 0x10 + fst.d F3, P3, 0x18 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d P3, P3, 0x20 + +.L_N0: + blt ZERO, J, .L_J1 + +.L_M3: + andi J, M, 0x02 + beq ZERO, J, .L_M1 + + move S1, S0 + add.d S2, S0, TL + add.d S0, S0, T0 + + move P1, P0 + addi.d P0, P0, 0x40 + + srai.d I, N, 0x02 + beq ZERO, I, .L_2N3 + +.L_2I1: /* I-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vst U0, P1, 0x00 + vst U1, P1, 0x10 + vst U2, P1, 0x20 + vst U3, P1, 0x30 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d I, I, -1 + add.d P1, P1, T1 + + blt ZERO, I, .L_2I1 + +.L_2N3: + andi I, N, 0x02 + beq ZERO, I, .L_2N1 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + + vst U0, P2, 0x00 + vst U1, P2, 0x10 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d P2, P2, 0x20 + +.L_2N1: + addi.d I, N, 0x01 + beq ZERO, I, .L_M1 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + + fst.d F0, P3, 0x00 + fst.d F1, P3, 0x08 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d P3, P3, 0x10 +.L_M1: + andi J, M, 0x01 + beq ZERO, J, .L_M0 + + move S1, S0 + move P1, P0 + + srai.d I, N, 0x02 + beq 
ZERO, I, .L_1N3 + +.L_1I1: + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vst U0, P1, 0x00 + vst U1, P1, 0x10 + + addi.d S1, S1, 0x20 + addi.d I, I, -1 + add.d P1, P1, T1 + + blt ZERO, I, .L_1I1 + +.L_1N3: + andi I, N, 0x02 + beq I, ZERO, .L_1N1 + + fld.d F0, S1, 0x00 + fld.d F1, S1, 0x08 + + fst.d F0, P2, 0x00 + fst.d F1, P2, 0x08 + + addi.d S1, S1, 0x10 + addi.d P2, P2, 0x10 + +.L_1N1: + andi I, N, 0x01 + beq I, ZERO, .L_M0 + + fld.d F0, S1, 0x00 + + fst.d F0, P3, 0x00 + +.L_M0: + pop_if_used 18, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE diff --git a/kernel/loongarch64/dgemm_tcopy_8_lsx.S b/kernel/loongarch64/dgemm_tcopy_8_lsx.S new file mode 100644 index 000000000..a7e3ef69c --- /dev/null +++ b/kernel/loongarch64/dgemm_tcopy_8_lsx.S @@ -0,0 +1,597 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
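
Editor's note: the 8-column tcopy below follows the same pattern with one extra split level. A small sketch of the destination partition it computes (P2, P3, P4 in the assembly), under the same assumptions as the previous sketch; the helper name is illustrative.

    #include <stddef.h>

    /* Destination regions computed by the 8-column tcopy (units: doubles). */
    static void dgemm_tcopy_8_regions(ptrdiff_t m, ptrdiff_t n, double *b,
                                      double **b1, double **b2,
                                      double **b3, double **b4)
    {
        *b1 = b;                               /* full 8-column panels */
        *b2 = b + m * (n & ~(ptrdiff_t)7);     /* n & 4 tail           */
        *b3 = b + m * (n & ~(ptrdiff_t)3);     /* n & 2 tail           */
        *b4 = b + m * (n & ~(ptrdiff_t)1);     /* n & 1 tail           */
    }
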
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S0 $r11 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define P0 $r20 +#define P1 $r23 +#define P2 $r24 +#define P3 $r25 +#define P4 $r26 +#define P5 $r27 +#define T0 $r28 +#define T1 $r29 +#define TL $r7 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 +/* LASX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 + + PROLOGUE + push_if_used 24, 8 + + move S0, SRC + move P0, DST + + srai.d T0, N, 0x03 + srai.d T1, N, 0x02 + slli.d T0, T0, 0x03 + slli.d T1, T1, 0x02 + mul.d P2, M, T0 + mul.d P3, M, T1 + slli.d P2, P2, 0x03 + slli.d P3, P3, 0x03 + add.d P2, DST, P2 + add.d P3, DST, P3 + + srai.d T0, N, 0x01 + slli.d T0, T0, 0x01 + mul.d P4, M, T0 + slli.d P4, P4, 0x03 + add.d P4, DST, P4 + + slli.d TL, LDA, 0x03 + srai.d J, M, 0x03 + slli.d T0, TL, 0x01 + slli.d T1, M, 0x06 + beq ZERO, J, .L_M7 +.L_J1: /* J-- */ + move S1, S0 + add.d S2, S0, TL + add.d S3, S1, T0 + add.d S4, S2, T0 + add.d S5, S3, T0 + add.d S6, S4, T0 + add.d S7, S5, T0 + add.d S8, S6, T0 + add.d S0, S7, T0 + + move P1, P0 + addi.d P0, P0, 0x200 + + srai.d I, N, 0x03 + addi.d J, J, -1 + beq ZERO, I, .L_N7 + +.L_I1: /* I-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + vld U4, S2, 0x00 + vld U5, S2, 0x10 + vld U6, S2, 0x20 + vld U7, S2, 0x30 + + vst U0, P1, 0x00 + vst U1, P1, 0x10 + vst U2, P1, 0x20 + vst U3, P1, 0x30 + vst U4, P1, 0x40 + vst U5, P1, 0x50 + vst U6, P1, 0x60 + vst U7, P1, 0x70 + + vld U0, S3, 0x00 + vld U1, S3, 0x10 + vld U2, S3, 0x20 + vld U3, S3, 0x30 + vld U4, S4, 0x00 + vld U5, S4, 0x10 + vld U6, S4, 0x20 + vld U7, S4, 0x30 + + vst U0, P1, 0x80 + vst U1, P1, 0x90 + vst U2, P1, 0xa0 + vst U3, P1, 0xb0 + vst U4, P1, 0xc0 + vst U5, P1, 0xd0 + vst U6, P1, 0xe0 + vst U7, P1, 0xf0 + + vld U0, S5, 0x00 + vld U1, S5, 0x10 + vld U2, S5, 0x20 + vld U3, S5, 0x30 + vld U4, S6, 0x00 + vld U5, S6, 0x10 + vld U6, S6, 0x20 + vld U7, S6, 0x30 + + vst U0, P1, 0x100 + vst U1, P1, 0x110 + vst U2, P1, 0x120 + vst U3, P1, 0x130 + vst U4, P1, 0x140 + vst U5, P1, 0x150 + vst U6, P1, 0x160 + vst U7, P1, 0x170 + + vld U0, S7, 0x00 + vld U1, S7, 0x10 + vld U2, S7, 0x20 + vld U3, S7, 0x30 + vld U4, S8, 0x00 + vld U5, S8, 0x10 + vld U6, S8, 0x20 + vld U7, S8, 0x30 + + vst U0, P1, 0x180 + vst U1, P1, 0x190 + vst U2, P1, 0x1a0 + vst U3, P1, 0x1b0 + vst U4, P1, 0x1c0 + vst U5, P1, 0x1d0 + vst U6, P1, 0x1e0 + vst U7, P1, 0x1f0 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + addi.d S5, S5, 0x40 + addi.d S6, S6, 0x40 + addi.d S7, S7, 0x40 + addi.d S8, S8, 0x40 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_I1 +.L_N7: + andi I, N, 0x04 + beq ZERO, I, .L_N3 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S2, 0x00 + vld U3, S2, 0x10 + vld U4, S3, 0x00 + vld U5, S3, 0x10 + vld U6, S4, 0x00 + vld U7, S4, 0x10 + + vst U0, P2, 0x00 + vst U1, P2, 0x10 + vst U2, P2, 0x20 + vst U3, P2, 0x30 + vst U4, P2, 0x40 + 
vst U5, P2, 0x50 + vst U6, P2, 0x60 + vst U7, P2, 0x70 + + vld U0, S5, 0x00 + vld U1, S5, 0x10 + vld U2, S6, 0x00 + vld U3, S6, 0x10 + vld U4, S7, 0x00 + vld U5, S7, 0x10 + vld U6, S8, 0x00 + vld U7, S8, 0x10 + + vst U0, P2, 0x80 + vst U1, P2, 0x90 + vst U2, P2, 0xa0 + vst U3, P2, 0xb0 + vst U4, P2, 0xc0 + vst U5, P2, 0xd0 + vst U6, P2, 0xe0 + vst U7, P2, 0xf0 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d S5, S5, 0x20 + addi.d S6, S6, 0x20 + addi.d S7, S7, 0x20 + addi.d S8, S8, 0x20 + addi.d P2, P2, 0x100 + +.L_N3: + andi I, N, 0x02 + beq ZERO, I, .L_N1 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + vld U2, S3, 0x00 + vld U3, S4, 0x00 + vld U4, S5, 0x00 + vld U5, S6, 0x00 + vld U6, S7, 0x00 + vld U7, S8, 0x00 + + vst U0, P3, 0x00 + vst U1, P3, 0x10 + vst U2, P3, 0x20 + vst U3, P3, 0x30 + vst U4, P3, 0x40 + vst U5, P3, 0x50 + vst U6, P3, 0x60 + vst U7, P3, 0x70 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d S5, S5, 0x10 + addi.d S6, S6, 0x10 + addi.d S7, S7, 0x10 + addi.d S8, S8, 0x10 + addi.d P3, P3, 0x80 + +.L_N1: + andi I, N, 0x01 + beq ZERO, I, .L_N0 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + fld.d F4, S5, 0x00 + fld.d F5, S6, 0x00 + fld.d F6, S7, 0x00 + fld.d F7, S8, 0x00 + + fst.d F0, P4, 0x00 + fst.d F1, P4, 0x08 + fst.d F2, P4, 0x10 + fst.d F3, P4, 0x18 + fst.d F4, P4, 0x20 + fst.d F5, P4, 0x28 + + fst.d F6, P4, 0x30 + fst.d F7, P4, 0x38 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d S5, S5, 0x08 + addi.d S6, S6, 0x08 + addi.d S7, S7, 0x08 + addi.d S8, S8, 0x08 + addi.d P4, P4, 0x40 + +.L_N0: + blt ZERO, J, .L_J1 +.L_M7: + andi J, M, 0x04 + beq ZERO, J, .L_M3 + + move S1, S0 + add.d S2, S0, TL + add.d S3, S1, T0 + add.d S4, S2, T0 + add.d S0, S3, T0 + + move P1, P0 + addi.d P0, P0, 0x100 + + srai.d I, N, 0x03 + beq ZERO, I, .L_4N7 +.L_4I1: /* I-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + vld U4, S2, 0x00 + vld U5, S2, 0x10 + vld U6, S2, 0x20 + vld U7, S2, 0x30 + + vst U0, P1, 0x00 + vst U1, P1, 0x10 + vst U2, P1, 0x20 + vst U3, P1, 0x30 + vst U4, P1, 0x40 + vst U5, P1, 0x50 + vst U6, P1, 0x60 + vst U7, P1, 0x70 + + vld U0, S3, 0x00 + vld U1, S3, 0x10 + vld U2, S3, 0x20 + vld U3, S3, 0x30 + vld U4, S4, 0x00 + vld U5, S4, 0x10 + vld U6, S4, 0x20 + vld U7, S4, 0x30 + + vst U0, P1, 0x80 + vst U1, P1, 0x90 + vst U2, P1, 0xa0 + vst U3, P1, 0xb0 + vst U4, P1, 0xc0 + vst U5, P1, 0xd0 + vst U6, P1, 0xe0 + vst U7, P1, 0xf0 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_4I1 +.L_4N7: + andi I, N, 0x04 + beq ZERO, I, .L_4N3 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S2, 0x00 + vld U3, S2, 0x10 + vld U4, S3, 0x00 + vld U5, S3, 0x10 + vld U6, S4, 0x00 + vld U7, S4, 0x10 + + vst U0, P2, 0x00 + vst U1, P2, 0x10 + vst U2, P2, 0x20 + vst U3, P2, 0x30 + vst U4, P2, 0x40 + vst U5, P2, 0x50 + vst U6, P2, 0x60 + vst U7, P2, 0x70 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d P2, P2, 0x80 + +.L_4N3: + andi I, N, 0x02 + beq ZERO, I, .L_4N1 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + vld U2, S3, 0x00 + vld U3, S4, 0x00 + + vst U0, P3, 0x00 + vst U1, P3, 0x10 + vst U2, P3, 0x20 + vst U3, P3, 0x30 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d P3, P3, 0x40 + +.L_4N1: + andi 
I, N, 0x01 + beq ZERO, I, .L_M3 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + + fst.d F0, P4, 0x00 + fst.d F1, P4, 0x08 + fst.d F2, P4, 0x10 + fst.d F3, P4, 0x18 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d P4, P4, 0x20 +.L_M3: + andi J, M, 0x02 + beq ZERO, J, .L_M1 + + move S1, S0 + add.d S2, S0, TL + add.d S0, S0, T0 + + move P1, P0 + addi.d P0, P0, 0x80 + + srai.d I, N, 0x03 + beq ZERO, I, .L_2N7 +.L_2I1: /* I-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + vld U4, S2, 0x00 + vld U5, S2, 0x10 + vld U6, S2, 0x20 + vld U7, S2, 0x30 + + vst U0, P1, 0x00 + vst U1, P1, 0x10 + vst U2, P1, 0x20 + vst U3, P1, 0x30 + vst U4, P1, 0x40 + vst U5, P1, 0x50 + vst U6, P1, 0x60 + vst U7, P1, 0x70 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_2I1 +.L_2N7: + andi I, N, 0x04 + beq ZERO, I, .L_2N3 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vst U0, P2, 0x00 + vst U1, P2, 0x10 + vst U2, P2, 0x20 + vst U3, P2, 0x30 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d P2, P2, 0x40 + +.L_2N3: + andi I, N, 0x02 + beq ZERO, I, .L_2N1 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + + vst U0, P3, 0x00 + vst U1, P3, 0x10 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d P3, P3, 0x20 + +.L_2N1: + andi I, N, 0x01 + beq ZERO, I, .L_M1 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + + fst.d F0, P4, 0x00 + fst.d F1, P4, 0x08 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d P4, P4, 0x10 +.L_M1: + andi J, M, 0x01 + beq ZERO, J, .L_M0 + + move S1, S0 + add.d S2, S0, TL + + move P1, P0 + addi.d P0, P0, 0x40 + + srai.d I, N, 0x03 + beq ZERO, I, .L_1N7 +.L_1I1: /* I-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + + vst U0, P1, 0x00 + vst U1, P1, 0x10 + vst U2, P1, 0x20 + vst U3, P1, 0x30 + + addi.d S1, S1, 0x40 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_1I1 + +.L_1N7: + andi I, N, 0x04 + beq ZERO, I, .L_1N3 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vst U0, P2, 0x00 + vst U1, P2, 0x10 + + addi.d S1, S1, 0x20 + addi.d P2, P2, 0x20 + +.L_1N3: + andi I, N, 0x02 + beq ZERO, I, .L_1N1 + + vld U0, S1, 0x00 + vst U0, P3, 0x00 + + addi.d S1, S1, 0x10 + addi.d P3, P3, 0x10 + +.L_1N1: + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + fld.d F0, S1, 0x00 + + fst.d F0, P4, 0x00 + + addi.d S1, S1, 0x08 + addi.d P4, P4, 0x08 +.L_M0: + pop_if_used 24, 8 + jirl $r0, $r1, 0x00 + EPILOGUE diff --git a/kernel/loongarch64/dmax_lasx.S b/kernel/loongarch64/dmax_lasx.S deleted file mode 100644 index 46366d2ec..000000000 --- a/kernel/loongarch64/dmax_lasx.S +++ /dev/null @@ -1,175 +0,0 @@ -#define ASSEMBLER - -#include "common.h" - -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r12 -#define J $r13 -#define t1 $r14 -#define t2 $r18 -#define t3 $r15 -#define t4 $r17 -#define TEMP $r16 -#define m0 $xr8 -#define x1 $xr9 -#define x2 $xr10 -#define x3 $xr11 -#define x4 $xr12 -#define VX0 $xr20 -#define VX1 $xr21 -#define VM0 $xr22 -#define VM1 $xr23 -#define VM2 $xr19 - - PROLOGUE - - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - bne INCX, TEMP, .L20 - xvld VM0, X, 0 - srai.d I, N, 3 - bge $r0, I, .L12 - .align 3 - -.L10: - xvld VX0, X, 0 * SIZE - xvld VX1, X, 4 * SIZE - addi.d I, I, -1 - xvfmax.d VM1, VX1, VX0 - addi.d X, X, 8 * SIZE - xvfmax.d VM0, VM0, VM1 - blt $r0, I, .L10 - .align 3 - -.L11: - 
xvpickve.d x1, VM0, 0 - xvpickve.d x2, VM0, 1 - xvpickve.d x3, VM0, 2 - xvpickve.d x4, VM0, 3 - xvfmax.d VM1, x1, x2 - xvfmax.d VM2, x3, x4 - xvfmax.d VM0, VM1, VM2 - .align 3 - -.L12: //INCX==1 and N<8 - andi I, N, 7 - li.d J, 4 - bge J, I, .L13 // 4 0) I-- */ + move S1, TS //a_offset1 + add.d S2, TS, TL //a_offset2 + srai.d J, M, 0x02 + add.d TS, TS, T0 + + beq J, ZERO, .L_I3 + +.L_I1: /* if (j > 0) J-- */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + + xvand.v D0, U0, U0 + xvand.v D1, U1, U1 + xvand.v D2, U2, U2 + xvand.v D3, U3, U3 + + xvpermi.q D0, U2, 0x02 + xvpermi.q D2, U0, 0x31 + xvpermi.q D1, U3, 0x02 + xvpermi.q D3, U1, 0x31 + + xvst D0, TD, 0x00 + xvst D2, TD, 0x20 + xvst D1, TD, 0x40 + xvst D3, TD, 0x60 + + addi.d S1, S1, 0x40 // a_offset1 + addi.d S2, S2, 0x40 + addi.d TD, TD, 0x80 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_I1 + +.L_I3: + andi J, M, 0x03 + beq J, ZERO, .L_II20 + +.L_II1: /* j = (m & 3) if (j > 0) */ + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + + vst $vr0, TD, 0x00 + vst $vr1, TD, 0x10 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d TD, TD, 0x20 + + addi.d J, J, -1 + blt ZERO, J, .L_II1 + +.L_II20: + addi.d I, I, -1 + blt ZERO, I, .L_J1 + +.L_N0: /* if(n&1)*/ + andi I, N, 0x01 + beq ZERO, I, .L_N00 + +.L_N1: + srai.d J, M, 0x02 + beq ZERO, J, .L_N10 + +.L_N11: /* j = (m >> 2) if (j > 0) */ + xvld U0, TS, 0x00 + xvld U1, TS, 0x20 + + xvst U0, TD, 0x00 + xvst U1, TD, 0x20 + + addi.d TS, TS, 0x40 // a_offset + addi.d TD, TD, 0x40 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_N11 + +.L_N10: + andi J, M, 0x03 + beq J, ZERO, .L_N00 + +.L_N12: /* j = (m & 3) if (j > 0) */ + vld $vr0, TS, 0x00 + vst $vr0, TD, 0x00 + + + addi.d TS, TS, 0x10 // a_offset + addi.d TD, TD, 0x10 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_N12 + +.L_N00: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/zgemm_ncopy_4_lasx.S b/kernel/loongarch64/zgemm_ncopy_4_lasx.S new file mode 100644 index 000000000..5d874bcc1 --- /dev/null +++ b/kernel/loongarch64/zgemm_ncopy_4_lasx.S @@ -0,0 +1,320 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
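
Editor's note: the complex (zgemm) ncopy routines that follow use the same row-wise packing as the real-valued ones, except that each element is a (re, im) pair of doubles and lda is counted in complex elements. A rough C sketch of the 4-column variants; the name and signature are illustrative, not code from the patch.

    #include <stddef.h>

    /* Rough model of the complex 4-column pack: element (i,j) is the pair
     * a[2*(j*lda + i)], a[2*(j*lda + i) + 1]; each packed group holds the
     * four column entries of one row, real and imaginary parts together. */
    static void zgemm_ncopy_4_ref(ptrdiff_t m, ptrdiff_t n,
                                  const double *a, ptrdiff_t lda, double *b)
    {
        ptrdiff_t i, j, k;
        for (j = 0; j + 4 <= n; j += 4)
            for (i = 0; i < m; i++)
                for (k = 0; k < 4; k++) {              /* column j+k, row i */
                    const double *src = a + 2 * ((j + k) * lda + i);
                    *b++ = src[0];                     /* real part */
                    *b++ = src[1];                     /* imag part */
                }
        if (n & 2) {
            for (i = 0; i < m; i++)
                for (k = 0; k < 2; k++) {
                    const double *src = a + 2 * ((j + k) * lda + i);
                    *b++ = src[0]; *b++ = src[1];
                }
            j += 2;
        }
        if (n & 1)
            for (i = 0; i < m; i++) {
                const double *src = a + 2 * (j * lda + i);
                *b++ = src[0]; *b++ = src[1];
            }
    }
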
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define TD $r20 +#define TS $r11 +#define TL $r19 +#define T0 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define D0 $xr8 +#define D1 $xr9 +#define D2 $xr10 +#define D3 $xr11 +#define D4 $xr12 +#define D5 $xr13 +#define D6 $xr14 +#define D7 $xr15 +#define D8 $xr16 + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TD, DST //boffset + move TS, SRC //aoffset + + slli.d TL, LDA, 0x03 + slli.d TL, TL, 0x01 + + srai.d J, N, 0x02 + beq J, ZERO, .L_N0 + +.L_J1: /* J-- */ + move S1, TS + add.d S2, S1, TL + add.d S3, S2, TL + add.d S4, S3, TL + + slli.d T0, TL, 0x02 + add.d TS, TS, T0 + + srai.d I, M, 0x02 + beq I, ZERO, .L_I3 + +.L_I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + xvld U4, S3, 0x00 + xvld U5, S3, 0x20 + xvld U6, S4, 0x00 + xvld U7, S4, 0x20 + + xvand.v D0, U0, U0 + xvand.v D1, U1, U1 + xvand.v D2, U2, U2 + xvand.v D3, U3, U3 + xvand.v D4, U4, U4 + xvand.v D5, U5, U5 + xvand.v D6, U6, U6 + xvand.v D7, U7, U7 + + xvpermi.q D0, U2, 0x02 + xvpermi.q D4, U6, 0x02 + xvpermi.q D2, U0, 0x31 + xvpermi.q D6, U4, 0x31 + xvpermi.q D1, U3, 0x02 + xvpermi.q D5, U7, 0x02 + xvpermi.q D3, U1, 0x31 + xvpermi.q D7, U5, 0x31 + + xvst D0, TD, 0x00 + xvst D4, TD, 0x20 + xvst D2, TD, 0x40 + xvst D6, TD, 0x60 + xvst D1, TD, 0x80 + xvst D5, TD, 0xa0 + xvst D3, TD, 0xc0 + xvst D7, TD, 0xe0 + + addi.d S1, S1, 0x40 // a_offset + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + addi.d TD, TD, 0x100 // b_offset + + addi.d I, I, -1 + blt ZERO, I, .L_I1 + +.L_I3: + andi I, M, 0x02 + beq I, ZERO, .L_II20 + +.L_II1: /* if(m&2) */ + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + xvand.v D0, U0, U0 + xvand.v D1, U1, U1 + xvand.v D2, U2, U2 + xvand.v D3, U3, U3 + + xvpermi.q D0, U1, 0x02 + xvpermi.q D2, U3, 0x02 + xvpermi.q D1, U0, 0x31 + xvpermi.q D3, U2, 0x31 + + xvst D0, TD, 0x00 + xvst D2, TD, 0x20 + xvst D1, TD, 0x40 + xvst D3, TD, 0x60 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d TD, TD, 0x80 + +.L_II20: + andi I, M, 0x01 + beq I, ZERO, .L_J0 + +.L_II2: /* if(m&1) */ + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + vld $vr2, S3, 0x00 + vld $vr3, S4, 0x00 + + vst $vr0, TD, 0x00 + vst $vr1, TD, 0x10 + vst $vr2, TD, 0x20 + vst 
$vr3, TD, 0x30 + + addi.d TD, TD, 0x40 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_N0: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_N20 + + move S1, TS + add.d S2, S1, TL + + slli.d T0, TL, 0x01 + add.d TS, TS, T0 + + srai.d I, M, 0x02 + beq ZERO, I, .L_N10 + +.L_N11: /* if(i>0) */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + + xvand.v D0, U0, U0 + xvand.v D1, U1, U1 + xvand.v D2, U2, U2 + xvand.v D3, U3, U3 + + xvpermi.q D0, U2, 0x02 + xvpermi.q D2, U0, 0x31 + xvpermi.q D1, U3, 0x02 + xvpermi.q D3, U1, 0x31 + + xvst D0, TD, 0x00 + xvst D2, TD, 0x20 + xvst D1, TD, 0x40 + xvst D3, TD, 0x60 + + addi.d S1, S1, 0x40 // a_offset + addi.d S2, S2, 0x40 + addi.d TD, TD, 0x80 // b_offset + + addi.d I, I, -1 + blt ZERO, I, .L_N11 + +.L_N10: /* if(m&2) */ + andi I, M, 0x02 + beq I, ZERO, .L_N130 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvand.v D0, U0, U0 + + xvpermi.q D0, U1, 0x02 + xvpermi.q U1, U0, 0x31 + + xvst D0, TD, 0x00 + xvst U1, TD, 0x20 + + addi.d S1, S1, 0x20 // a_offset + addi.d S2, S2, 0x20 + addi.d TD, TD, 0x40 // b_offset + +.L_N130: /* if(m&1) */ + andi I, M, 0x01 + beq I, ZERO, .L_N20 + + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + + vst $vr0, TD, 0x00 + vst $vr1, TD, 0x10 + + addi.d TD, TD, 0x20 + +.L_N20: /* if(n&1) */ + andi I, N, 0x01 + beq I, ZERO, .L_N00 + + move S1, TS + srai.d I, M, 0x02 + + beq I, ZERO, .L_N30 + +.L_N21: /* if(i>0) */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + + xvst U0, TD, 0x00 + xvst U1, TD, 0x20 + + addi.d S1, S1, 0x40 // aoffset1 + addi.d TD, TD, 0x40 // b_offset + + addi.d I, I, -1 + blt ZERO, I, .L_N21 + +.L_N30: /* if(m&2) */ + andi I, M, 0x02 + beq I, ZERO, .L_N330 + + xvld U0, S1, 0x00 + xvst U0, TD, 0x00 + + addi.d S1, S1, 0x20 // aoffset1 + addi.d TD, TD, 0x20 // b_offset + +.L_N330: /* if(m&1) */ + andi I, M, 0x01 + beq I, ZERO, .L_N00 + + vld $vr0, S1, 0x00 + vst $vr0, TD, 0x00 + +.L_N00: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/zgemm_ncopy_4_lsx.S b/kernel/loongarch64/zgemm_ncopy_4_lsx.S new file mode 100644 index 000000000..203471cbd --- /dev/null +++ b/kernel/loongarch64/zgemm_ncopy_4_lsx.S @@ -0,0 +1,332 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define TD $r20 +#define TS $r11 +#define TL $r19 +#define T0 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LSX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define U8 $vr8 +#define U9 $vr9 +#define U10 $vr10 +#define U11 $vr11 +#define U12 $vr12 +#define U13 $vr13 +#define U14 $vr14 +#define U15 $vr15 + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TD, DST //boffset + move TS, SRC //aoffset + + slli.d TL, LDA, 0x03 + slli.d TL, TL, 0x01 + + srai.d J, N, 0x02 + beq J, ZERO, .L_N0 + +.L_J1: /* J-- */ + move S1, TS + add.d S2, S1, TL + add.d S3, S2, TL + add.d S4, S3, TL + + slli.d T0, TL, 0x02 + add.d TS, TS, T0 + + srai.d I, M, 0x02 + beq I, ZERO, .L_I3 + +.L_I1: /* I-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + + vld U4, S2, 0x00 + vld U5, S2, 0x10 + vld U6, S2, 0x20 + vld U7, S2, 0x30 + + vld U8, S3, 0x00 + vld U9, S3, 0x10 + vld U10, S3, 0x20 + vld U11, S3, 0x30 + + vld U12, S4, 0x00 + vld U13, S4, 0x10 + vld U14, S4, 0x20 + vld U15, S4, 0x30 + + vst U0, TD, 0x00 + vst U4, TD, 0x10 + vst U8, TD, 0x20 + vst U12, TD, 0x30 + + vst U1, TD, 0x40 + vst U5, TD, 0x50 + vst U9, TD, 0x60 + vst U13, TD, 0x70 + + vst U2, TD, 0x80 + vst U6, TD, 0x90 + vst U10, TD, 0xa0 + vst U14, TD, 0xb0 + + vst U3, TD, 0xc0 + vst U7, TD, 0xd0 + vst U11, TD, 0xe0 + vst U15, TD, 0xf0 + + addi.d S1, S1, 0x40 // a_offset + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + addi.d TD, TD, 0x100 // b_offset + + addi.d I, I, -1 + blt ZERO, I, .L_I1 + +.L_I3: /* if(m&2) */ + andi I, M, 0x02 + beq I, ZERO, .L_II20 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vld U4, S3, 0x00 + vld U5, S3, 0x10 + + vld U6, S4, 0x00 + vld U7, S4, 0x10 + + vst U0, TD, 0x00 + vst U2, TD, 0x10 + vst U4, TD, 0x20 + vst U6, TD, 0x30 + + vst U1, TD, 0x40 + vst U3, TD, 0x50 + vst U5, TD, 0x60 + vst U7, TD, 0x70 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d TD, TD, 0x80 + +.L_II20: /* if(m&1) */ + andi I, M, 0x01 + beq I, ZERO, .L_J0 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + vld U2, S3, 0x00 + vld U3, S4, 0x00 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + vst U2, TD, 0x20 + vst U3, TD, 0x30 + + addi.d TD, TD, 0x40 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_N0: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, 
I, .L_N20 + + move S1, TS + add.d S2, S1, TL + + slli.d T0, TL, 0x01 + add.d TS, TS, T0 + + srai.d I, M, 0x02 + beq ZERO, I, .L_N10 + +.L_N11: /* if(i>0) */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + + vld U4, S2, 0x00 + vld U5, S2, 0x10 + vld U6, S2, 0x20 + vld U7, S2, 0x30 + + vst U0, TD, 0x00 + vst U4, TD, 0x10 + vst U1, TD, 0x20 + vst U5, TD, 0x30 + + vst U2, TD, 0x40 + vst U6, TD, 0x50 + vst U3, TD, 0x60 + vst U7, TD, 0x70 + + addi.d S1, S1, 0x40 // a_offset + addi.d S2, S2, 0x40 + addi.d TD, TD, 0x80 // b_offset + + addi.d I, I, -1 + blt ZERO, I, .L_N11 + +.L_N10: /* if(m&2) */ + andi I, M, 0x02 + beq I, ZERO, .L_N130 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vst U0, TD, 0x00 + vst U2, TD, 0x10 + vst U1, TD, 0x20 + vst U3, TD, 0x30 + + addi.d S1, S1, 0x20 // a_offset + addi.d S2, S2, 0x20 + addi.d TD, TD, 0x40 // b_offset + +.L_N130: /* if(m&1) */ + andi I, M, 0x01 + beq I, ZERO, .L_N20 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + + addi.d TD, TD, 0x20 + +.L_N20: /* if(n&1) */ + andi I, N, 0x01 + beq I, ZERO, .L_N00 + + move S1, TS + srai.d I, M, 0x02 + + beq I, ZERO, .L_N30 + +.L_N21: /* if(i>0) */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + vst U2, TD, 0x20 + vst U3, TD, 0x30 + + addi.d S1, S1, 0x40 // aoffset1 + addi.d TD, TD, 0x40 // b_offset + + addi.d I, I, -1 + blt ZERO, I, .L_N21 + +.L_N30: /* if(m&2) */ + andi I, M, 0x02 + beq I, ZERO, .L_N330 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + + addi.d S1, S1, 0x20 // aoffset1 + addi.d TD, TD, 0x20 // b_offset + +.L_N330: /* if(m&1) */ + andi I, M, 0x01 + beq I, ZERO, .L_N00 + + vld U0, S1, 0x00 + + vst U0, TD, 0x00 + +.L_N00: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/zgemm_ncopy_8_lasx.S b/kernel/loongarch64/zgemm_ncopy_8_lasx.S new file mode 100644 index 000000000..7cd8f605b --- /dev/null +++ b/kernel/loongarch64/zgemm_ncopy_8_lasx.S @@ -0,0 +1,263 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define TD $r20 +#define TS $r11 +#define TL $r7 +#define T0 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define D0 $xr8 +#define D1 $xr9 +#define D2 $xr10 +#define D3 $xr11 +#define D4 $xr12 +#define D5 $xr13 +#define D6 $xr14 +#define D7 $xr15 +#define D8 $xr16 + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TD, DST //boffset + move TS, SRC //aoffset + + slli.d TL, LDA, 0x03 //lda + slli.d TL, TL, 0x01 + + slli.d T0, TL, 0x03 + srai.d J, N, 0x03 //j + + beq J, ZERO, .L_N1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS + add.d S2, TS, TL + move I, M + add.d S3, S2, TL + add.d S4, S3, TL + add.d S5, S4, TL + add.d S6, S5, TL + add.d S7, S6, TL + add.d S8, S7, TL + add.d TS, TS, T0 + + beq I, ZERO, .L_J11 + +.L_I1: /* if(i>0) i--*/ + fld.d F0, S1, 0x00 + fld.d F1, S1, 0x08 + fld.d F2, S2, 0x00 + fld.d F3, S2, 0x08 + fld.d F4, S3, 0x00 + fld.d F5, S3, 0x08 + fld.d F6, S4, 0x00 + fld.d F7, S4, 0x08 + + fst.d F0, TD, 0x00 + fst.d F1, TD, 0x08 + fst.d F2, TD, 0x10 + fst.d F3, TD, 0x18 + fst.d F4, TD, 0x20 + fst.d F5, TD, 0x28 + fst.d F6, TD, 0x30 + fst.d F7, TD, 0x38 + + fld.d F0, S5, 0x00 + fld.d F1, S5, 0x08 + fld.d F2, S6, 0x00 + fld.d F3, S6, 0x08 + fld.d F4, S7, 0x00 + fld.d F5, S7, 0x08 + fld.d F6, S8, 0x00 + fld.d F7, S8, 0x08 + + fst.d F0, TD, 0x40 + fst.d F1, TD, 0x48 + fst.d F2, TD, 0x50 + fst.d F3, TD, 0x58 + fst.d F4, TD, 0x60 + fst.d F5, TD, 0x68 + fst.d F6, TD, 0x70 + fst.d F7, TD, 0x78 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d S5, S5, 0x10 + addi.d S6, S6, 0x10 + addi.d S7, S7, 0x10 + addi.d S8, S8, 0x10 + addi.d TD, TD, 0x80 + + addi.d I, I, -1 + blt ZERO, I, .L_I1 + +.L_J11: /* j--*/ + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_N1: /* if(n&4)*/ + andi I, N, 0x04 + beq I, ZERO, .L_N2 + + move S1, TS + add.d S2, TS, TL + move I, M + add.d S3, S2, TL + add.d S4, S3, TL + add.d TS, S4, TL + + beq I, ZERO, .L_N2 + +.L_N11: /* if(i>0)*/ + fld.d F0, S1, 0x00 + fld.d F1, S1, 0x08 + fld.d F2, S2, 0x00 + fld.d F3, S2, 0x08 + fld.d F4, S3, 0x00 + fld.d F5, S3, 0x08 + fld.d F6, S4, 0x00 + fld.d F7, S4, 0x08 + + fst.d F0, TD, 0x00 + fst.d F1, TD, 0x08 + fst.d F2, TD, 0x10 + fst.d F3, TD, 0x18 + fst.d F4, TD, 0x20 + fst.d F5, TD, 0x28 + fst.d F6, TD, 0x30 + fst.d F7, TD, 0x38 + 
+ addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d TD, TD, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_N11 + +.L_N2: /* if(n&2)*/ + andi I, N, 0x02 + beq I, ZERO, .L_N3 + + move S1, TS + add.d S2, TS, TL + move I, M + add.d TS, S2, TL + + beq I, ZERO, .L_N3 + +.L_N21: /* if(i>0)*/ + fld.d F0, S1, 0x00 + fld.d F1, S1, 0x08 + fld.d F2, S2, 0x00 + fld.d F3, S2, 0x08 + + fst.d F0, TD, 0x00 + fst.d F1, TD, 0x08 + fst.d F2, TD, 0x10 + fst.d F3, TD, 0x18 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d TD, TD, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_N21 + +.L_N3: /* if(n&2)*/ + andi I, N, 0x01 + beq I, ZERO, .L_N0 + + move S1, TS + move I, M + + beq I, ZERO, .L_N0 + +.L_N31: /* if(i>0)*/ + fld.d F0, S1, 0x00 + fld.d F1, S1, 0x08 + + fst.d F0, TD, 0x00 + fst.d F1, TD, 0x08 + + addi.d S1, S1, 0x10 + addi.d TD, TD, 0x10 + + addi.d I, I, -1 + blt ZERO, I, .L_N31 + +.L_N0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/zgemm_tcopy_2_lasx.S b/kernel/loongarch64/zgemm_tcopy_2_lasx.S new file mode 100644 index 000000000..3fe17beef --- /dev/null +++ b/kernel/loongarch64/zgemm_tcopy_2_lasx.S @@ -0,0 +1,212 @@ +/******************************************************************************* +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define TD $r16 +#define TS $r17 +#define TL $r7 +#define T0 $r18 +#define S8 $r19 +#define S9 $r20 +#define S10 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define D0 $xr8 +#define D1 $xr9 +#define D2 $xr10 +#define D3 $xr11 +#define D4 $xr12 +#define D5 $xr13 +#define D6 $xr14 +#define D7 $xr15 + + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TS, SRC //aoffset + move TD, DST //boffset + slli.d TL, LDA, 0x03 //lda + slli.d TL, TL, 0x01 + + ori T0, ZERO, 0x01 + andn T0, N, T0 + mul.d T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x03 + add.d S9, DST, T0 //boffset2 + + srai.d J, M, 0x01 //j + + beq J, ZERO, .L_M1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 + add.d S2, S1, TL //aoffset2 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x40 + + srai.d I, N, 0x02 + beq ZERO, I, .L_JN1 + +.L_JI1: /* if(i>0) i--*/ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + + xvst U0, S8, 0x00 + xvst U2, S8, 0x20 + + slli.d T0, M, 0x05 + add.d S8, S8, T0 + + xvst U1, S8, 0x00 + xvst U3, S8, 0x20 + + add.d S8, S8, T0 + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_JI1 + +.L_JN1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_JN2 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + xvst U0, S8, 0x00 + xvst U1, S8, 0x20 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + +.L_JN2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_J0 + + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + + vst $vr0, S9, 0x00 + vst $vr1, S9, 0x10 + + addi.d S9, S9, 0x20 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_M0 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M1N1 + +.L_M1I1: /* if(i>0) */ + xvld U0, TS, 0x00 + xvld U1, TS, 0x20 + + xvst U0, TD, 0x00 + + slli.d T0, M, 0x05 + add.d TD, TD, T0 + + xvst U1, TD, 0x00 + + add.d TD, TD, T0 + addi.d TS, TS, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_M1I1 + +.L_M1N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M1N2 + + xvld U0, TS, 0x00 + + xvst U0, TD, 0x00 + + addi.d TS, TS, 0x20 + +.L_M1N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + vld $vr0, TS, 0x00 + + vst $vr0, S9, 0x00 + +.L_M0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/zgemm_tcopy_4_lasx.S b/kernel/loongarch64/zgemm_tcopy_4_lasx.S new file mode 100644 index 000000000..1adee11c5 --- /dev/null +++ b/kernel/loongarch64/zgemm_tcopy_4_lasx.S @@ -0,0 +1,302 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define TD $r16 +#define TS $r17 +#define TL $r18 +#define T0 $r19 +#define S8 $r20 +#define S9 $r23 +#define S10 $r11 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define D0 $xr8 +#define D1 $xr9 +#define D2 $xr10 +#define D3 $xr11 +#define D4 $xr12 +#define D5 $xr13 +#define D6 $xr14 +#define D7 $xr15 + + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TS, SRC //aoffset + move TD, DST //boffset + slli.d TL, LDA, 0x03 //lda + slli.d TL, TL, 0x01 //lda + + ori T0, ZERO, 0x03 + andn T0, N, T0 + mul.d T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x03 + add.d S9, DST, T0 //boffset2 + + ori T0, ZERO, 0x01 + andn T0, N, T0 + mul.d T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x03 + add.d S10, DST, T0 //boffset3 + + srai.d J, M, 0x02 //j + + beq J, ZERO, .L_M1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS //aoffset1 + add.d S2, S1, TL + add.d S3, S2, TL + add.d S4, S3, TL + + slli.d T0, TL, 0x02 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x100 + + srai.d I, N, 0x02 + + beq ZERO, I, .L_JN1 + +.L_JI1: /* if(i>0) i--*/ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + xvld U4, S3, 0x00 + xvld U5, S3, 0x20 + xvld U6, S4, 0x00 + xvld U7, S4, 0x20 + + xvst U0, S8, 0x00 + xvst U1, S8, 0x20 + xvst U2, S8, 0x40 + xvst U3, S8, 0x60 + xvst U4, S8, 0x80 + xvst U5, S8, 0xa0 + xvst U6, S8, 0xc0 + xvst U7, S8, 0xe0 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + slli.d T0, M, 0x06 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_JI1 + +.L_JN1: /* if(n&2) */ 
+ andi I, N, 0x02 + beq ZERO, I, .L_JN2 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + xvst U0, S9, 0x00 + xvst U1, S9, 0x20 + xvst U2, S9, 0x40 + xvst U3, S9, 0x60 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d S9, S9, 0x80 + +.L_JN2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_J0 + + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + vld $vr2, S3, 0x00 + vld $vr3, S4, 0x00 + + vst $vr0, S10, 0x00 + vst $vr1, S10, 0x10 + vst $vr2, S10, 0x20 + vst $vr3, S10, 0x30 + + addi.d S10, S10, 0x40 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_M1: /* if(m&2) */ + andi I, M, 0x02 + beq ZERO, I, .L_M2 + + move S1, TS //aoffset1 + add.d S2, S1, TL + + slli.d T0, TL, 0x01 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x80 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M1N1 + +.L_M1I1: /* if(i>0) */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + + xvst U0, S8, 0x00 + xvst U1, S8, 0x20 + xvst U2, S8, 0x40 + xvst U3, S8, 0x60 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + slli.d T0, M, 0x06 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_M1I1 + +.L_M1N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M1N2 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + xvst U0, S9, 0x00 + xvst U1, S9, 0x20 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S9, S9, 0x40 + +.L_M1N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M2 + + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + + vst $vr0, S10, 0x00 + vst $vr1, S10, 0x10 + + addi.d S10, S10, 0x20 + +.L_M2: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_M0 + + move S1, TS //aoffset1 + move S8, TD //boffset1 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M2N1 + +.L_M2I1: /* if(i>0) */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + + xvst U0, S8, 0x00 + xvst U1, S8, 0x20 + + addi.d S1, S1, 0x40 + slli.d T0, M, 0x06 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_M2I1 + +.L_M2N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M2N2 + + xvld U0, S1, 0x00 + + xvst U0, S9, 0x00 + + addi.d S1, S1, 0x20 + +.L_M2N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + vld $vr0, S1, 0x00 + + vst $vr0, S10, 0x00 + +.L_M0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/zgemm_tcopy_4_lsx.S b/kernel/loongarch64/zgemm_tcopy_4_lsx.S new file mode 100644 index 000000000..954753eaf --- /dev/null +++ b/kernel/loongarch64/zgemm_tcopy_4_lsx.S @@ -0,0 +1,355 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define TD $r16 +#define TS $r17 +#define TL $r18 +#define T0 $r19 +#define S8 $r20 +#define S9 $r23 +#define S10 $r11 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LSX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define U8 $vr8 +#define U9 $vr9 +#define U10 $vr10 +#define U11 $vr11 +#define U12 $vr12 +#define U13 $vr13 +#define U14 $vr14 +#define U15 $vr15 + + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TS, SRC //aoffset + move TD, DST //boffset + slli.d TL, LDA, 0x03 //lda + slli.d TL, TL, 0x01 //lda + + ori T0, ZERO, 0x03 + andn T0, N, T0 + mul.d T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x03 + add.d S9, DST, T0 //boffset2 + + ori T0, ZERO, 0x01 + andn T0, N, T0 + mul.d T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x03 + add.d S10, DST, T0 //boffset3 + + srai.d J, M, 0x02 //j + + beq J, ZERO, .L_M1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS //aoffset1 + add.d S2, S1, TL + add.d S3, S2, TL + add.d S4, S3, TL + + slli.d T0, TL, 0x02 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x100 + + srai.d I, N, 0x02 + + beq ZERO, I, .L_JN1 + +.L_JI1: /* if(i>0) i--*/ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + + vld U4, S2, 0x00 + vld U5, S2, 0x10 + vld U6, S2, 0x20 + vld U7, S2, 0x30 + + vld U8, S3, 0x00 + vld U9, S3, 0x10 + vld U10, S3, 0x20 + vld U11, S3, 0x30 + + vld U12, S4, 0x00 + vld U13, S4, 0x10 + vld U14, S4, 0x20 + vld U15, S4, 0x30 + + vst U0, S8, 0x00 + vst U1, S8, 0x10 + vst U2, S8, 0x20 + vst U3, S8, 0x30 + vst U4, S8, 0x40 + vst U5, S8, 0x50 + vst U6, S8, 0x60 + vst U7, S8, 0x70 + + vst U8, S8, 0x80 + vst U9, S8, 0x90 + vst U10, S8, 0xa0 + vst U11, S8, 0xb0 + vst U12, S8, 0xc0 + vst U13, S8, 0xd0 + vst U14, S8, 0xe0 + vst U15, S8, 0xf0 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + slli.d T0, M, 0x06 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_JI1 + +.L_JN1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_JN2 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vld U4, S3, 0x00 + vld U5, S3, 0x10 + + vld U6, S4, 0x00 + vld U7, S4, 0x10 + + vst U0, S9, 
0x00 + vst U1, S9, 0x10 + vst U2, S9, 0x20 + vst U3, S9, 0x30 + + vst U4, S9, 0x40 + vst U5, S9, 0x50 + vst U6, S9, 0x60 + vst U7, S9, 0x70 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d S9, S9, 0x80 + +.L_JN2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_J0 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + vld U2, S3, 0x00 + vld U3, S4, 0x00 + + vst U0, S10, 0x00 + vst U1, S10, 0x10 + vst U2, S10, 0x20 + vst U3, S10, 0x30 + + addi.d S10, S10, 0x40 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_M1: /* if(m&2) */ + andi I, M, 0x02 + beq ZERO, I, .L_M2 + + move S1, TS //aoffset1 + add.d S2, S1, TL + + slli.d T0, TL, 0x01 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x80 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M1N1 + +.L_M1I1: /* if(i>0) */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + + vld U4, S2, 0x00 + vld U5, S2, 0x10 + vld U6, S2, 0x20 + vld U7, S2, 0x30 + + vst U0, S8, 0x00 + vst U1, S8, 0x10 + vst U2, S8, 0x20 + vst U3, S8, 0x30 + + vst U4, S8, 0x40 + vst U5, S8, 0x50 + vst U6, S8, 0x60 + vst U7, S8, 0x70 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + slli.d T0, M, 0x06 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_M1I1 + +.L_M1N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M1N2 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vst U0, S9, 0x00 + vst U1, S9, 0x10 + vst U2, S9, 0x20 + vst U3, S9, 0x30 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S9, S9, 0x40 + +.L_M1N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M2 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + + vst U0, S10, 0x00 + vst U1, S10, 0x10 + + addi.d S10, S10, 0x20 + +.L_M2: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_M0 + + move S1, TS //aoffset1 + move S8, TD //boffset1 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M2N1 + +.L_M2I1: /* if(i>0) */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + + vst U0, S8, 0x00 + vst U1, S8, 0x10 + vst U2, S8, 0x20 + vst U3, S8, 0x30 + + addi.d S1, S1, 0x40 + slli.d T0, M, 0x06 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_M2I1 + +.L_M2N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M2N2 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vst U0, S9, 0x00 + vst U1, S9, 0x10 + + addi.d S1, S1, 0x20 + +.L_M2N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + vld U0, S1, 0x00 + + vst U0, S10, 0x00 + +.L_M0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/zgemm_tcopy_8_lasx.S b/kernel/loongarch64/zgemm_tcopy_8_lasx.S new file mode 100644 index 000000000..f7440dc24 --- /dev/null +++ b/kernel/loongarch64/zgemm_tcopy_8_lasx.S @@ -0,0 +1,268 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define TD $r20 +#define TS $r11 +#define TL $r7 +#define T0 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define D0 $xr8 +#define D1 $xr9 +#define D2 $xr10 +#define D3 $xr11 +#define D4 $xr12 +#define D5 $xr13 +#define D6 $xr14 +#define D7 $xr15 + + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TS, SRC //aoffset + move TD, DST //boffset + slli.d TL, LDA, 0x03 //lda + slli.d TL, TL, 0x01 + + srai.d J, N, 0x03 //j + + beq J, ZERO, .L_N1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 //2*lda + add.d S2, TS, TL + addi.d TS, TS, 0x80 + + srai.d I, M, 0x01 + beq ZERO, I, .L_J1M1 + +.L_J1I1: /* if(i>0) i--*/ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S1, 0x40 + xvld U3, S1, 0x60 + xvld U4, S2, 0x00 + xvld U5, S2, 0x20 + xvld U6, S2, 0x40 + xvld U7, S2, 0x60 + + xvst U0, TD, 0x00 + xvst U1, TD, 0x20 + xvst U2, TD, 0x40 + xvst U3, TD, 0x60 + xvst U4, TD, 0x80 + xvst U5, TD, 0xa0 + xvst U6, TD, 0xc0 + xvst U7, TD, 0xe0 + + add.d S1, S1, T0 + add.d S2, S2, T0 + addi.d TD, TD, 0x100 + + addi.d I, I, -1 + blt ZERO, I, .L_J1I1 + +.L_J1M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_J0 + + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S1, 0x40 + xvld U3, S1, 0x60 + + xvst U0, TD, 0x00 + xvst U1, TD, 0x20 + xvst U2, TD, 0x40 + xvst U3, TD, 0x60 + + addi.d TD, TD, 0x80 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_N1: /* if(n&4) */ + andi I, N, 0x04 + beq ZERO, I, .L_N2 + + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 //2*lda + add.d S2, TS, TL + addi.d TS, TS, 0x40 + + srai.d I, M, 0x01 + beq ZERO, I, .L_N1M1 + +.L_N1I1: /* if(i>0) i-- */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + + xvst U0, TD, 0x00 + xvst U1, TD, 0x20 + xvst U2, TD, 0x40 + xvst U3, TD, 0x60 + + add.d S1, S1, T0 + add.d S2, S2, T0 + addi.d TD, TD, 0x80 + + 
addi.d I, I, -1 + blt ZERO, I, .L_N1I1 + +.L_N1M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_N2 + + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + + xvst U0, TD, 0x00 + xvst U1, TD, 0x20 + + addi.d TD, TD, 0x40 + +.L_N2: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_N3 + + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 //2*lda + add.d S2, TS, TL + addi.d TS, TS, 0x20 + + srai.d I, M, 0x01 + beq ZERO, I, .L_N2M1 + +.L_N2I1: /* if(i>0) i-- */ + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + xvst U0, TD, 0x00 + xvst U1, TD, 0x20 + + add.d S1, S1, T0 + add.d S2, S2, T0 + + addi.d TD, TD, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_N2I1 + +.L_N2M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_N3 + + xvld U0, S1, 0x00 + + xvst U0, TD, 0x00 + + addi.d TD, TD, 0x20 + +.L_N3: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_N0 + + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 //2*lda + add.d S2, TS, TL + + srai.d I, M, 0x01 + beq ZERO, I, .L_N3M1 + +.L_N3I1: /* if(i>0) i-- */ + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + + vst $vr0, TD, 0x00 + vst $vr1, TD, 0x10 + + add.d S1, S1, T0 + add.d S2, S2, T0 + addi.d TD, TD, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_N3I1 + +.L_N3M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_N0 + + vld $vr0, S1, 0x00 + + vst $vr0, TD, 0x00 + +.L_N0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/znrm2_lasx.S b/kernel/loongarch64/znrm2_lasx.S new file mode 100644 index 000000000..53f8a6e05 --- /dev/null +++ b/kernel/loongarch64/znrm2_lasx.S @@ -0,0 +1,252 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define XX $r19 +#define I $r17 +#define TEMP $r18 +#define t1 $r12 +#define t2 $r13 +#define t3 $r14 +#define t4 $r15 +#define INF $f23 +#define a1 $f22 +#define max $f17 +#define ALPHA $f12 +#define a3 $f15 +#define a2 $f16 +#define VX0 $xr15 +#define VX1 $xr16 +#define VM0 $xr17 +#define VM1 $xr18 +#define VM2 $xr13 +#define VM3 $xr14 +#define res1 $xr19 +#define res2 $xr20 +#define VALPHA $xr21 + + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + xvxor.v VM0, VM0, VM0 + bge $r0, N, .L999 + beq $r0, INCX, .L999 + move XX, X + // Init INF + addi.d TEMP, $r0, 0x7FF + slli.d TEMP, TEMP, 52 + MTC INF, TEMP + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bge $r0, I, .L97 + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvfmaxa.d VM1, VX1, VX0 + xvfmaxa.d VM0, VM0, VM1 + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L10 + b .L96 + .align 3 + +.L20: // INCX!=1 + bge $r0, I, .L97 + .align 3 + +.L21: + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + add.d X, X, INCX + xvfmaxa.d VM1, VX0, VX1 + xvfmaxa.d VM0, VM0, VM1 + addi.d I, I, -1 + blt $r0, I, .L21 + b .L96 + .align 3 + +.L96: + xvpickve.d VX0, VM0, 1 + xvpickve.d VX1, VM0, 2 + xvpickve.d VM3, VM0, 3 + xvfmaxa.d VM1, VX0, VX1 + xvfmaxa.d VM2, VM3, VM0 + xvfmaxa.d VM0, VM1, VM2 + .align 3 + +.L97: + andi I, N, 3 + bge $r0, I, .L99 + .align 3 + +.L98: + fld.d a3, X, 0 * SIZE + fld.d a2, X, 1 * SIZE + fmaxa.d a3, a2, a3 + fmaxa.d max, a3, max + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L98 + .align 3 + +.L99: + fabs.d max, max + lu12i.w TEMP, 0x3f800 // 1 + movgr2fr.d a1, $r0 + movgr2fr.w ALPHA, TEMP + CMPEQ $fcc0, max, a1 + fcvt.d.s ALPHA, ALPHA + bcnez $fcc0, .L999 + fdiv.d ALPHA, ALPHA, max + CMPEQ $fcc0, INF, ALPHA + bcnez $fcc0, .L999 + movfr2gr.d TEMP, ALPHA + xvreplgr2vr.d VALPHA, TEMP + +.L100: + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L120 + bge $r0, I, .L997 + .align 3 + +.L110: + xvld VX0, XX, 0 * SIZE + xvld VX1, XX, 4 * SIZE + xvfmul.d VM2, VX0, VALPHA + xvfmul.d VM3, VX1, VALPHA + xvfmadd.d res1, VM2, VM2, res1 + xvfmadd.d res2, VM3, VM3, res2 + addi.d XX, XX, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L110 + b .L996 + .align 3 + +.L120: + bge $r0, I, .L997 + .align 3 + +.L121: + ld.d t1, XX, 0 * SIZE + ld.d t2, XX, 1 * SIZE + add.d XX, XX, INCX + ld.d t3, XX, 0 * SIZE + ld.d t4, XX, 1 * SIZE + add.d XX, XX, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, XX, 0 * SIZE + ld.d t2, XX, 1 * SIZE + add.d XX, XX, INCX + ld.d t3, XX, 0 * SIZE + ld.d t4, XX, 1 * SIZE + add.d XX, XX, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfmul.d VM2, VX0, VALPHA + xvfmul.d VM3, VX1, VALPHA + xvfmadd.d res1, VM2, 
VM2, res1 + xvfmadd.d res2, VM3, VM3, res2 + addi.d I, I, -1 + blt $r0, I, .L121 + b .L996 + .align 3 + +.L996: + xvfadd.d res1, res1, res2 + xvpickve.d VX0, res1, 1 + xvpickve.d VX1, res1, 2 + xvpickve.d VM2, res1, 3 + xvfadd.d res1, VX0, res1 + xvfadd.d VX1, VX1, VM2 + xvfadd.d res1, VX1, res1 + .align 3 + +.L997: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.d a3, XX, 0 * SIZE + fld.d a2, XX, 1 * SIZE + addi.d I, I, -1 + fmul.d a3, a3, ALPHA + fmadd.d $f19, a3, a3, $f19 + fmul.d a2, a2, ALPHA + fmadd.d $f19, a2, a2, $f19 + add.d XX, XX , INCX + blt $r0, I, .L998 + .align 3 + +.L999: + fsqrt.d $f19, $f19 + fmul.d $f0, max, $f19 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/znrm2_lsx.S b/kernel/loongarch64/znrm2_lsx.S new file mode 100644 index 000000000..14c59d504 --- /dev/null +++ b/kernel/loongarch64/znrm2_lsx.S @@ -0,0 +1,260 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define XX $r19 +#define I $r17 +#define TEMP $r18 +#define t1 $r12 +#define t2 $r13 +#define t3 $r14 +#define t4 $r15 +#define INF $f23 +#define a1 $f22 +#define max $f17 +#define ALPHA $f12 +#define a3 $f15 +#define a2 $f16 +#define VX0 $vr15 +#define VX1 $vr16 +#define VM0 $vr17 +#define VM1 $vr18 +#define VM2 $vr13 +#define VM3 $vr14 +#define res1 $vr19 +#define res2 $vr20 +#define VALPHA $vr21 + + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + vxor.v VM0, VM0, VM0 + bge $r0, N, .L999 + beq $r0, INCX, .L999 + move XX, X + // Init INF + addi.d TEMP, $r0, 0x7FF + slli.d TEMP, TEMP, 52 + MTC INF, TEMP + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bge $r0, I, .L97 + .align 3 + +.L10: + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vfmaxa.d VM1, VX1, VX0 + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vfmaxa.d VM2, VX1, VX0 + vfmaxa.d VM3, VM1, VM2 + vfmaxa.d VM0, VM0, VM3 + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L10 + b .L96 + .align 3 + +.L20: // INCX!=1 + bge $r0, I, .L97 + .align 3 + +.L21: + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfmaxa.d VM1, VX0, VX1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmaxa.d VM2, VX0, VX1 + vfmaxa.d VM3, VM1, VM2 + vfmaxa.d VM0, VM0, VM3 + addi.d I, I, -1 + blt $r0, I, .L21 + b .L96 + .align 3 + +.L96: + vreplvei.d VX0, VM0, 0 + vreplvei.d VX1, VM0, 1 + vfmaxa.d VM0, VX0, VX1 + .align 3 + +.L97: + andi I, N, 3 + bge $r0, I, .L99 + .align 3 + +.L98: + fld.d a3, X, 0 * SIZE + fld.d a2, X, 1 * SIZE + fmaxa.d a3, a2, a3 + fmaxa.d max, a3, max + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L98 + .align 3 + +.L99: + fabs.d max, max + lu12i.w TEMP, 0x3f800 // 1 + movgr2fr.d a1, $r0 + movgr2fr.w ALPHA, TEMP + CMPEQ $fcc0, max, a1 + fcvt.d.s ALPHA, ALPHA + bcnez $fcc0, .L999 + fdiv.d ALPHA, ALPHA, max + CMPEQ $fcc0, INF, ALPHA + bcnez $fcc0, .L999 + movfr2gr.d TEMP, ALPHA + vreplgr2vr.d VALPHA, TEMP + +.L100: + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L120 + bge $r0, I, .L997 + .align 3 + +.L110: + vld VX0, XX, 0 * SIZE + vld VX1, XX, 2 * SIZE + vfmul.d VM2, VX0, VALPHA + vfmul.d VM3, VX1, VALPHA + vfmadd.d res1, VM2, VM2, res1 + vfmadd.d res2, VM3, VM3, res2 + vld VX0, XX, 4 * SIZE + vld VX1, XX, 6 * SIZE + vfmul.d VM2, VX0, VALPHA + vfmul.d VM3, VX1, VALPHA + vfmadd.d res1, VM2, VM2, res1 + vfmadd.d res2, VM3, VM3, res2 + addi.d XX, XX, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L110 + b .L996 + .align 3 + +.L120: + bge $r0, I, .L997 + .align 3 + +.L121: + ld.d t1, XX, 0 * SIZE + ld.d t2, XX, 1 * SIZE + add.d XX, XX, INCX + ld.d t3, XX, 0 * SIZE + ld.d t4, XX, 1 * SIZE + add.d XX, XX, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmul.d VM2, VX0, VALPHA + ld.d t1, XX, 0 * SIZE + vfmul.d VM3, VX1, VALPHA + ld.d t2, XX, 1 * SIZE + add.d XX, 
XX, INCX + vfmadd.d res1, VM2, VM2, res1 + vfmadd.d res2, VM3, VM3, res2 + ld.d t3, XX, 0 * SIZE + ld.d t4, XX, 1 * SIZE + add.d XX, XX, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmul.d VM2, VX0, VALPHA + vfmul.d VM3, VX1, VALPHA + vfmadd.d res1, VM2, VM2, res1 + vfmadd.d res2, VM3, VM3, res2 + addi.d I, I, -1 + blt $r0, I, .L121 + b .L996 + .align 3 + +.L996: + vfadd.d res1, res1, res2 + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 + .align 3 + +.L997: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.d a3, XX, 0 * SIZE + fld.d a2, XX, 1 * SIZE + addi.d I, I, -1 + fmul.d a3, a3, ALPHA + fmadd.d $f19, a3, a3, $f19 + fmul.d a2, a2, ALPHA + fmadd.d $f19, a2, a2, $f19 + add.d XX, XX , INCX + blt $r0, I, .L998 + .align 3 + +.L999: + fsqrt.d $f19, $f19 + fmul.d $f0, max, $f19 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/mips/KERNEL.P5600 b/kernel/mips/KERNEL.P5600 index 9a6e06d67..c37b88adb 100644 --- a/kernel/mips/KERNEL.P5600 +++ b/kernel/mips/KERNEL.P5600 @@ -35,7 +35,7 @@ DSUMKERNEL = ../mips/sum.c CSUMKERNEL = ../mips/zsum.c ZSUMKERNEL = ../mips/zsum.c -ifdef HAVE_MSA +ifndef NO_MSA SASUMKERNEL = ../mips/sasum_msa.c DASUMKERNEL = ../mips/dasum_msa.c CASUMKERNEL = ../mips/casum_msa.c @@ -47,7 +47,7 @@ CASUMKERNEL = ../mips/zasum.c ZASUMKERNEL = ../mips/zasum.c endif -ifdef HAVE_MSA +ifndef NO_MSA SAXPYKERNEL = ../mips/saxpy_msa.c DAXPYKERNEL = ../mips/daxpy_msa.c CAXPYKERNEL = ../mips/caxpy_msa.c @@ -59,7 +59,7 @@ CAXPYKERNEL = ../mips/zaxpy.c ZAXPYKERNEL = ../mips/zaxpy.c endif -ifdef HAVE_MSA +ifndef NO_MSA SCOPYKERNEL = ../mips/scopy_msa.c DCOPYKERNEL = ../mips/dcopy_msa.c CCOPYKERNEL = ../mips/ccopy_msa.c @@ -71,7 +71,7 @@ CCOPYKERNEL = ../mips/zcopy.c ZCOPYKERNEL = ../mips/zcopy.c endif -ifdef HAVE_MSA +ifndef NO_MSA SDOTKERNEL = ../mips/sdot_msa.c DDOTKERNEL = ../mips/ddot_msa.c CDOTKERNEL = ../mips/cdot_msa.c @@ -88,7 +88,7 @@ DNRM2KERNEL = ../mips/nrm2.c CNRM2KERNEL = ../mips/znrm2.c ZNRM2KERNEL = ../mips/znrm2.c -ifdef HAVE_MSA +ifndef NO_MSA SROTKERNEL = ../mips/srot_msa.c DROTKERNEL = ../mips/drot_msa.c CROTKERNEL = ../mips/crot_msa.c @@ -100,11 +100,13 @@ CROTKERNEL = ../mips/zrot.c ZROTKERNEL = ../mips/zrot.c endif -ifdef HAVE_MSA +ifndef NO_MSA SSCALKERNEL = ../mips/sscal_msa.c DSCALKERNEL = ../mips/dscal_msa.c -CSCALKERNEL = ../mips/cscal_msa.c -ZSCALKERNEL = ../mips/zscal_msa.c +#CSCALKERNEL = ../mips/cscal_msa.c +#ZSCALKERNEL = ../mips/zscal_msa.c +CSCALKERNEL = ../mips/zscal.c +ZSCALKERNEL = ../mips/zscal.c else SSCALKERNEL = ../mips/scal.c DSCALKERNEL = ../mips/scal.c @@ -112,7 +114,7 @@ CSCALKERNEL = ../mips/zscal.c ZSCALKERNEL = ../mips/zscal.c endif -ifdef HAVE_MSA +ifndef NO_MSA SSWAPKERNEL = ../mips/sswap_msa.c DSWAPKERNEL = ../mips/dswap_msa.c CSWAPKERNEL = ../mips/cswap_msa.c @@ -124,7 +126,7 @@ CSWAPKERNEL = ../mips/zswap.c ZSWAPKERNEL = ../mips/zswap.c endif -ifdef HAVE_MSA +ifndef NO_MSA SGEMVNKERNEL = ../mips/sgemv_n_msa.c DGEMVNKERNEL = ../mips/dgemv_n_msa.c CGEMVNKERNEL = ../mips/cgemv_n_msa.c @@ -136,7 +138,7 @@ CGEMVNKERNEL = ../mips/zgemv_n.c ZGEMVNKERNEL = ../mips/zgemv_n.c endif -ifdef HAVE_MSA +ifndef NO_MSA SGEMVTKERNEL = ../mips/sgemv_t_msa.c DGEMVTKERNEL = ../mips/dgemv_t_msa.c CGEMVTKERNEL = ../mips/cgemv_t_msa.c @@ -148,7 +150,7 @@ CGEMVTKERNEL = ../mips/zgemv_t.c ZGEMVTKERNEL = ../mips/zgemv_t.c endif -ifdef HAVE_MSA +ifndef NO_MSA SGEMMKERNEL = ../mips/sgemm_kernel_8x8_msa.c SGEMMONCOPY = ../mips/sgemm_ncopy_8_msa.c SGEMMOTCOPY = 
../mips/sgemm_tcopy_8_msa.c @@ -162,7 +164,7 @@ SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o endif -ifdef HAVE_MSA +ifndef NO_MSA DGEMMKERNEL = ../mips/dgemm_kernel_8x4_msa.c DGEMMINCOPY = ../mips/dgemm_ncopy_8_msa.c DGEMMITCOPY = ../mips/dgemm_tcopy_8_msa.c @@ -180,7 +182,7 @@ DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o endif -ifdef HAVE_MSA +ifndef NO_MSA CGEMMKERNEL = ../mips/cgemm_kernel_8x4_msa.c CGEMMINCOPY = ../mips/cgemm_ncopy_8_msa.c CGEMMITCOPY = ../mips/cgemm_tcopy_8_msa.c @@ -198,7 +200,7 @@ CGEMMONCOPYOBJ = cgemm_oncopy.o CGEMMOTCOPYOBJ = cgemm_otcopy.o endif -ifdef HAVE_MSA +ifndef NO_MSA ZGEMMKERNEL = ../mips/zgemm_kernel_4x4_msa.c ZGEMMONCOPY = ../mips/zgemm_ncopy_4_msa.c ZGEMMOTCOPY = ../mips/zgemm_tcopy_4_msa.c @@ -212,7 +214,7 @@ ZGEMMONCOPYOBJ = zgemm_oncopy.o ZGEMMOTCOPYOBJ = zgemm_otcopy.o endif -ifdef HAVE_MSA +ifndef NO_MSA STRSMKERNEL_LN = ../mips/strsm_kernel_LN_8x8_msa.c STRSMKERNEL_LT = ../mips/strsm_kernel_LT_8x8_msa.c STRSMKERNEL_RN = ../mips/strsm_kernel_RN_8x8_msa.c @@ -224,7 +226,7 @@ STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c endif -ifdef HAVE_MSA +ifndef NO_MSA DTRSMKERNEL_LN = ../mips/dtrsm_kernel_LN_8x4_msa.c DTRSMKERNEL_LT = ../mips/dtrsm_kernel_LT_8x4_msa.c DTRSMKERNEL_RN = ../mips/dtrsm_kernel_RN_8x4_msa.c @@ -236,7 +238,7 @@ DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c endif -ifdef HAVE_MSA +ifndef NO_MSA CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c @@ -248,7 +250,7 @@ CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c endif -ifdef HAVE_MSA +ifndef NO_MSA ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c diff --git a/kernel/mips/zscal.c b/kernel/mips/zscal.c index bca1155c1..7bb261941 100644 --- a/kernel/mips/zscal.c +++ b/kernel/mips/zscal.c @@ -47,6 +47,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F else { temp = - da_i * x[ip+1] ; + if (isnan(x[ip]) || isinf(x[ip])) temp = NAN; x[ip+1] = da_i * x[ip] ; } } @@ -63,8 +64,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ; } } - x[ip] = temp; - + if ( da_r != da_r ) + x[ip] = da_r; + else + x[ip] = temp; + ip += inc_x2; } diff --git a/kernel/mips64/KERNEL.LOONGSON3R4 b/kernel/mips64/KERNEL.LOONGSON3R4 index b81e5441d..1149d97f1 100644 --- a/kernel/mips64/KERNEL.LOONGSON3R4 +++ b/kernel/mips64/KERNEL.LOONGSON3R4 @@ -1,4 +1,4 @@ -ifdef HAVE_MSA +ifndef NO_MSA SAXPYKERNEL = ../mips/saxpy_msa.c DAXPYKERNEL = ../mips/daxpy_msa.c CAXPYKERNEL = ../mips/caxpy_msa.c @@ -8,14 +8,14 @@ SAXPYKERNEL = axpy_loongson3a.S DAXPYKERNEL = daxpy_loongson3a_simd.S endif -ifdef HAVE_MSA +ifndef NO_MSA SCOPYKERNEL = ../mips/scopy_msa.c DCOPYKERNEL = ../mips/dcopy_msa.c CCOPYKERNEL = ../mips/ccopy_msa.c ZCOPYKERNEL = ../mips/zcopy_msa.c endif -ifdef HAVE_MSA +ifndef NO_MSA SDOTKERNEL = ../mips/sdot_msa.c DDOTKERNEL = ../mips/ddot_msa.c CDOTKERNEL = ../mips/cdot_msa.c @@ -23,21 +23,21 @@ ZDOTKERNEL = ../mips/zdot_msa.c endif DSDOTKERNEL = ../mips/dot.c -ifdef HAVE_MSA +ifndef NO_MSA SROTKERNEL = ../mips/srot_msa.c DROTKERNEL = ../mips/drot_msa.c CROTKERNEL = ../mips/crot_msa.c ZROTKERNEL = ../mips/zrot_msa.c endif -ifdef HAVE_MSA +ifndef NO_MSA SSCALKERNEL = 
../mips/sscal_msa.c DSCALKERNEL = ../mips/dscal_msa.c CSCALKERNEL = ../mips/cscal_msa.c ZSCALKERNEL = ../mips/zscal_msa.c endif -ifdef HAVE_MSA +ifndef NO_MSA SGEMVNKERNEL = ../mips/sgemv_n_msa.c DGEMVNKERNEL = ../mips/dgemv_n_msa.c SGEMVTKERNEL = ../mips/sgemv_t_msa.c @@ -57,21 +57,21 @@ ZGEMVNKERNEL = zgemv_n_loongson3a.c ZGEMVTKERNEL = zgemv_t_loongson3a.c endif -ifdef HAVE_MSA +ifndef NO_MSA SASUMKERNEL = ../mips/sasum_msa.c DASUMKERNEL = ../mips/dasum_msa.c CASUMKERNEL = ../mips/casum_msa.c ZASUMKERNEL = ../mips/zasum_msa.c endif -ifdef HAVE_MSA +ifndef NO_MSA SSWAPKERNEL = ../mips/sswap_msa.c DSWAPKERNEL = ../mips/dswap_msa.c CSWAPKERNEL = ../mips/cswap_msa.c ZSWAPKERNEL = ../mips/zswap_msa.c endif -ifdef HAVE_MSA +ifndef NO_MSA SGEMMKERNEL = ../mips/sgemm_kernel_8x8_msa.c SGEMMONCOPY = ../mips/sgemm_ncopy_8_msa.c SGEMMOTCOPY = ../mips/sgemm_tcopy_8_msa.c @@ -89,7 +89,7 @@ SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) endif -ifdef HAVE_MSA +ifndef NO_MSA DGEMMKERNEL = ../mips/dgemm_kernel_8x4_msa.c DGEMMINCOPY = ../mips/dgemm_ncopy_8_msa.c DGEMMITCOPY = ../mips/dgemm_tcopy_8_msa.c @@ -107,7 +107,7 @@ DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) endif -ifdef HAVE_MSA +ifndef NO_MSA CGEMMKERNEL = ../mips/cgemm_kernel_8x4_msa.c CGEMMINCOPY = ../mips/cgemm_ncopy_8_msa.c CGEMMITCOPY = ../mips/cgemm_tcopy_8_msa.c @@ -129,7 +129,7 @@ CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) endif -ifdef HAVE_MSA +ifndef NO_MSA ZGEMMKERNEL = ../mips/zgemm_kernel_4x4_msa.c ZGEMMONCOPY = ../mips/zgemm_ncopy_4_msa.c ZGEMMOTCOPY = ../mips/zgemm_tcopy_4_msa.c @@ -143,7 +143,7 @@ ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) endif -ifdef HAVE_MSA +ifndef NO_MSA STRSMKERNEL_LN = ../mips/strsm_kernel_LN_8x8_msa.c STRSMKERNEL_LT = ../mips/strsm_kernel_LT_8x8_msa.c STRSMKERNEL_RN = ../mips/strsm_kernel_RN_8x8_msa.c @@ -155,7 +155,7 @@ STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c endif -ifdef HAVE_MSA +ifndef NO_MSA DTRSMKERNEL_LN = ../mips/dtrsm_kernel_LN_8x4_msa.c DTRSMKERNEL_LT = ../mips/dtrsm_kernel_LT_8x4_msa.c DTRSMKERNEL_RN = ../mips/dtrsm_kernel_RN_8x4_msa.c @@ -167,7 +167,7 @@ DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c endif -ifdef HAVE_MSA +ifndef NO_MSA CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c @@ -179,7 +179,7 @@ CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c endif -ifdef HAVE_MSA +ifndef NO_MSA ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index 9047c714c..c84cd91d2 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -25,7 +25,7 @@ ZTRMMKERNEL = zgemm_kernel_power10.S endif SGEMMKERNEL = sgemm_kernel_power10.c -SGEMMINCOPY = ../generic/gemm_ncopy_16.c +SGEMMINCOPY = sgemm_ncopy_16_power.c SGEMMITCOPY = sgemm_tcopy_16_power8.S SGEMMONCOPY = ../generic/gemm_ncopy_8.c SGEMMOTCOPY = sgemm_tcopy_8_power8.S diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index 2b8e65948..700a68e44 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -50,7 +50,7 
@@ CTRMMKERNEL = ctrmm_kernel_8x4_power8.S ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S SGEMMKERNEL = sgemm_kernel_16x8_power8.S -SGEMMINCOPY = ../generic/gemm_ncopy_16.c +SGEMMINCOPY = sgemm_ncopy_16_power.c SGEMMITCOPY = sgemm_tcopy_16_power8.S SGEMMONCOPY = ../generic/gemm_ncopy_8.c SGEMMOTCOPY = sgemm_tcopy_8_power8.S diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9 index b6b102b3e..7d007d1a2 100644 --- a/kernel/power/KERNEL.POWER9 +++ b/kernel/power/KERNEL.POWER9 @@ -13,7 +13,7 @@ CTRMMKERNEL = cgemm_kernel_power9.S ZTRMMKERNEL = zgemm_kernel_power9.S SGEMMKERNEL = sgemm_kernel_power9.S -SGEMMINCOPY = ../generic/gemm_ncopy_16.c +SGEMMINCOPY = sgemm_ncopy_16_power.c SGEMMITCOPY = sgemm_tcopy_16_power8.S SGEMMONCOPY = ../generic/gemm_ncopy_8.c SGEMMOTCOPY = sgemm_tcopy_8_power8.S diff --git a/kernel/power/KERNEL.PPC970 b/kernel/power/KERNEL.PPC970 index a99fb7d96..fee5fa529 100644 --- a/kernel/power/KERNEL.PPC970 +++ b/kernel/power/KERNEL.PPC970 @@ -1,11 +1,11 @@ ifeq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) SGEMMKERNEL = gemm_kernel.S -SGEMMINCOPY = -SGEMMITCOPY = +SGEMMINCOPY = +SGEMMITCOPY = SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c -SGEMMINCOPYOBJ = -SGEMMITCOPYOBJ = +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) else diff --git a/kernel/power/KERNEL.PPCG4 b/kernel/power/KERNEL.PPCG4 index 1bdd3119e..c73601cee 100644 --- a/kernel/power/KERNEL.PPCG4 +++ b/kernel/power/KERNEL.PPCG4 @@ -96,9 +96,9 @@ CGEMMINCOPY = CGEMMONCOPY = CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -CGEMMINCOPYOBJ = +CGEMMINCOPYOBJ = #cgemm_incopy$(TSUFFIX).$(SUFFIX) -CGEMMITCOPYOBJ = +CGEMMITCOPYOBJ = #cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/power/dgemm_tcopy_16_power8.S b/kernel/power/dgemm_tcopy_16_power8.S index fc52e0202..7a3c04e8e 100644 --- a/kernel/power/dgemm_tcopy_16_power8.S +++ b/kernel/power/dgemm_tcopy_16_power8.S @@ -107,6 +107,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define o0 0 +#ifdef POWER10 +#include "dgemm_tcopy_macros_16_power10.S" +#endif #include "dgemm_tcopy_macros_16_power8.S" #define STACKSIZE 144 diff --git a/kernel/power/dgemm_tcopy_macros_16_power10.S b/kernel/power/dgemm_tcopy_macros_16_power10.S new file mode 100644 index 000000000..3cb416e09 --- /dev/null +++ b/kernel/power/dgemm_tcopy_macros_16_power10.S @@ -0,0 +1,655 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/21 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + +/********************************************************************************************** +* Macros for N=4 and M=16 +**********************************************************************************************/ + +#if defined(_AIX) +define(`COPY_4x16', ` +#else +.macro COPY_4x16 +#endif + + lxvpx vs32, o0, A0 + lxvpx vs34, o32, A0 + addi A0, A0, 64 + + lxvpx vs40, o0, A1 + lxvpx vs42, o32, A1 + addi A1, A1, 64 + + lxvpx vs48, o0, A2 + lxvpx vs50, o32, A2 + addi A2, A2, 64 + + lxvpx vs4, o0, A3 + lxvpx vs6, o32, A3 + addi A3, A3, 64 + + lxvpx vs36, o0, A0 + lxvpx vs38, o32, A0 + addi A0, A0, 64 + + lxvpx vs44, o0, A1 + lxvpx vs46, o32, A1 + addi A1, A1, 64 + + lxvpx vs12, o0, A2 + lxvpx vs2, o32, A2 + addi A2, A2, 64 + + lxvpx vs8, o0, A3 + lxvpx vs10, o32, A3 + addi A3, A3, 64 + + mr T1, BO + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 +#else + stxvd2x vs33, o0, T1 + stxvd2x vs32, o16, T1 + stxvd2x vs35, o32, T1 + stxvd2x vs34, o48, T1 +#endif + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 +#else + stxvd2x vs37, o0, T1 + stxvd2x vs36, o16, T1 + stxvd2x vs39, o32, T1 + stxvd2x vs38, o48, T1 +#endif + + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs40, o0, T1 + stxvd2x vs41, o16, T1 + stxvd2x vs42, o32, T1 + stxvd2x vs43, o48, T1 +#else + stxvd2x vs41, o0, T1 + stxvd2x vs40, o16, T1 + stxvd2x vs43, o32, T1 + stxvd2x vs42, o48, T1 +#endif + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs44, o0, T1 + stxvd2x vs45, o16, T1 + stxvd2x vs46, o32, T1 + stxvd2x vs47, o48, T1 +#else + stxvd2x vs45, o0, T1 + stxvd2x vs44, o16, T1 + stxvd2x vs47, o32, T1 + stxvd2x vs46, o48, T1 +#endif + + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs48, o0, T1 + stxvd2x vs49, o16, T1 + stxvd2x vs50, o32, T1 + stxvd2x vs51, o48, T1 +#else + stxvd2x vs49, o0, T1 + stxvd2x vs48, o16, T1 + stxvd2x vs51, o32, T1 + stxvd2x vs50, o48, T1 +#endif + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs12, o0, T1 + stxvd2x vs13, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 +#else + stxvd2x vs13, o0, T1 + stxvd2x vs12, o16, T1 + stxvd2x vs3, o32, T1 + stxvd2x vs2, o48, 
T1 +#endif + + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs4, o0, T1 + stxvd2x vs5, o16, T1 + stxvd2x vs6, o32, T1 + stxvd2x vs7, o48, T1 +#else + stxvd2x vs5, o0, T1 + stxvd2x vs4, o16, T1 + stxvd2x vs7, o32, T1 + stxvd2x vs6, o48, T1 +#endif + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 +#else + stxvd2x vs9, o0, T1 + stxvd2x vs8, o16, T1 + stxvd2x vs11, o32, T1 + stxvd2x vs10, o48, T1 +#endif + +#if defined(_AIX) +') +#else +.endm +#endif + + +/********************************************************************************************** +* Macros for N=4 and M=8 +**********************************************************************************************/ + +#if defined(_AIX) +define(`COPY_4x8', ` +#else +.macro COPY_4x8 +#endif + + lxvpx vs32, o0, A0 + lxvpx vs34, o32, A0 + addi A0, A0, 64 + + + lxvpx vs36, o0, A1 + lxvpx vs38, o32, A1 + addi A1, A1, 64 + + + lxvpx vs40, o0, A2 + lxvpx vs42, o32, A2 + addi A2, A2, 64 + + + lxvpx vs44, o0, A3 + lxvpx vs46, o32, A3 + addi A3, A3, 64 + + + mr T1, BO + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 +#else + stxvd2x vs33, o0, T1 + stxvd2x vs32, o16, T1 + stxvd2x vs35, o32, T1 + stxvd2x vs34, o48, T1 +#endif + + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 +#else + stxvd2x vs37, o0, T1 + stxvd2x vs36, o16, T1 + stxvd2x vs39, o32, T1 + stxvd2x vs38, o48, T1 +#endif + + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs40, o0, T1 + stxvd2x vs41, o16, T1 + stxvd2x vs42, o32, T1 + stxvd2x vs43, o48, T1 +#else + stxvd2x vs41, o0, T1 + stxvd2x vs40, o16, T1 + stxvd2x vs43, o32, T1 + stxvd2x vs42, o48, T1 +#endif + + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs44, o0, T1 + stxvd2x vs45, o16, T1 + stxvd2x vs46, o32, T1 + stxvd2x vs47, o48, T1 +#else + stxvd2x vs45, o0, T1 + stxvd2x vs44, o16, T1 + stxvd2x vs47, o32, T1 + stxvd2x vs46, o48, T1 +#endif + +#if defined(_AIX) +') +#else +.endm +#endif + + +/********************************************************************************************** +* Macros for N=4 and M=4 +**********************************************************************************************/ + +#if defined(_AIX) +define(`COPY_4x4', ` +#else +.macro COPY_4x4 +#endif + + lxvpx vs32, o0, A0 + addi A0, A0, 32 + + + lxvpx vs34, o0, A1 + addi A1, A1, 32 + + + lxvpx vs36, o0, A2 + addi A2, A2, 32 + + + lxvpx vs38, o0, A3 + addi A3, A3, 32 + + + mr T1, BO + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 +#else + stxvd2x vs33, o0, T1 + stxvd2x vs32, o16, T1 + + stxvd2x vs35, o32, T1 + stxvd2x vs34, o48, T1 +#endif + + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 +#else + stxvd2x vs37, o0, T1 + stxvd2x vs36, o16, T1 + + stxvd2x vs39, o32, T1 + stxvd2x vs38, o48, T1 +#endif + +#if defined(_AIX) +') +#else +.endm +#endif + + +/********************************************************************************************** +* Macros for N=2 and M=16 
+**********************************************************************************************/ + +#if defined(_AIX) +define(`COPY_2x16', ` +#else +.macro COPY_2x16 +#endif + + lxvpx vs32, o0, A0 + lxvpx vs34, o32, A0 + addi A0, A0, 64 + + lxvpx vs36, o0, A0 + lxvpx vs38, o32, A0 + addi A0, A0, 64 + + + lxvpx vs40, o0, A1 + lxvpx vs42, o32, A1 + addi A1, A1, 64 + + lxvpx vs44, o0, A1 + lxvpx vs46, o32, A1 + addi A1, A1, 64 + + + mr T1, BO + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 +#else + stxvd2x vs33, o0, T1 + stxvd2x vs32, o16, T1 + stxvd2x vs35, o32, T1 + stxvd2x vs34, o48, T1 +#endif + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 +#else + stxvd2x vs37, o0, T1 + stxvd2x vs36, o16, T1 + stxvd2x vs39, o32, T1 + stxvd2x vs38, o48, T1 +#endif + + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs40, o0, T1 + stxvd2x vs41, o16, T1 + stxvd2x vs42, o32, T1 + stxvd2x vs43, o48, T1 +#else + stxvd2x vs41, o0, T1 + stxvd2x vs40, o16, T1 + stxvd2x vs43, o32, T1 + stxvd2x vs42, o48, T1 +#endif + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs44, o0, T1 + stxvd2x vs45, o16, T1 + stxvd2x vs46, o32, T1 + stxvd2x vs47, o48, T1 +#else + stxvd2x vs45, o0, T1 + stxvd2x vs44, o16, T1 + stxvd2x vs47, o32, T1 + stxvd2x vs46, o48, T1 +#endif + +#if defined(_AIX) +') +#else +.endm +#endif + + +/********************************************************************************************** +* Macros for N=2 and M=8 +**********************************************************************************************/ + +#if defined(_AIX) +define(`COPY_2x8', ` +#else +.macro COPY_2x8 +#endif + + lxvpx vs32, o0, A0 + lxvpx vs34, o32, A0 + addi A0, A0, 64 + + + lxvpx vs36, o0, A1 + lxvpx vs38, o0, A1 + addi A1, A1, 64 + + + mr T1, BO + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 +#else + stxvd2x vs33, o0, T1 + stxvd2x vs32, o16, T1 + stxvd2x vs35, o32, T1 + stxvd2x vs34, o48, T1 +#endif + + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 +#else + stxvd2x vs37, o0, T1 + stxvd2x vs36, o16, T1 + stxvd2x vs39, o32, T1 + stxvd2x vs38, o48, T1 +#endif + +#if defined(_AIX) +') +#else +.endm +#endif + + +/********************************************************************************************** +* Macros for N=2 and M=4 +**********************************************************************************************/ + +#if defined(_AIX) +define(`COPY_2x4', ` +#else +.macro COPY_2x4 +#endif + + lxvpx vs32, o0, A0 + addi A0, A0, 32 + + + lxvpx vs34, o0, A1 + addi A1, A1, 32 + + + mr T1, BO + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 +#else + stxvd2x vs33, o0, T1 + stxvd2x vs32, o16, T1 + + stxvd2x vs35, o32, T1 + stxvd2x vs34, o48, T1 +#endif + +#if defined(_AIX) +') +#else +.endm +#endif + + +/********************************************************************************************** +* Macros for N=1 and M=16 +**********************************************************************************************/ + +#if defined(_AIX) +define(`COPY_1x16', ` +#else 
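+/* COPY_1x16: pack a single 128-byte row (M=16) from A0 into BO using four
+   vector-pair (lxvpx) loads and eight endian-aware stxvd2x stores. */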
+.macro COPY_1x16 +#endif + + lxvpx vs32, o0, A0 + lxvpx vs34, o32, A0 + addi A0, A0, 64 + + lxvpx vs36, o0, A0 + lxvpx vs38, o0, A0 + addi A0, A0, 64 + + + mr T1, BO + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 +#else + stxvd2x vs33, o0, T1 + stxvd2x vs32, o16, T1 + stxvd2x vs35, o32, T1 + stxvd2x vs34, o48, T1 +#endif + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 +#else + stxvd2x vs37, o0, T1 + stxvd2x vs36, o16, T1 + stxvd2x vs39, o32, T1 + stxvd2x vs38, o48, T1 +#endif + +#if defined(_AIX) +') +#else +.endm +#endif + + +/********************************************************************************************** +* Macros for N=1 and M=8 +**********************************************************************************************/ + +#if defined(_AIX) +define(`COPY_1x8', ` +#else +.macro COPY_1x8 +#endif + + lxvpx vs32, o0, A0 + lxvpx vs34, o32, A0 + addi A0, A0, 64 + + + mr T1, BO + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 +#else + stxvd2x vs33, o0, T1 + stxvd2x vs32, o16, T1 + stxvd2x vs35, o32, T1 + stxvd2x vs34, o48, T1 +#endif + +#if defined(_AIX) +') +#else +.endm +#endif + + +/********************************************************************************************** +* Macros for N=1 and M=4 +**********************************************************************************************/ + +#if defined(_AIX) +define(`COPY_1x4', ` +#else +.macro COPY_1x4 +#endif + + lxvpx vs32, o0, A0 + addi A0, A0, 32 + + + mr T1, BO + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 +#else + stxvd2x vs33, o0, T1 + stxvd2x vs32, o16, T1 +#endif + +#if defined(_AIX) +') +#else +.endm +#endif + diff --git a/kernel/power/dgemm_tcopy_macros_16_power8.S b/kernel/power/dgemm_tcopy_macros_16_power8.S index 6c5b8ed62..93b488fd0 100644 --- a/kernel/power/dgemm_tcopy_macros_16_power8.S +++ b/kernel/power/dgemm_tcopy_macros_16_power8.S @@ -38,6 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* Macros for N=4 and M=16 **********************************************************************************************/ +#ifndef POWER10 #if defined(_AIX) define(`COPY_4x16', ` #else @@ -275,6 +276,7 @@ define(`COPY_4x4', ` #else .endm #endif +#endif /********************************************************************************************** @@ -369,6 +371,7 @@ define(`COPY_4x1', ` * Macros for N=2 and M=16 **********************************************************************************************/ +#ifndef POWER10 #if defined(_AIX) define(`COPY_2x16', ` #else @@ -512,6 +515,7 @@ define(`COPY_2x4', ` #else .endm #endif +#endif /********************************************************************************************** @@ -580,6 +584,7 @@ define(`COPY_2x1', ` * Macros for N=1 and M=16 **********************************************************************************************/ +#ifndef POWER10 #if defined(_AIX) define(`COPY_1x16', ` #else @@ -675,6 +680,7 @@ define(`COPY_1x4', ` #else .endm #endif +#endif /********************************************************************************************** diff --git a/kernel/power/sgemm_ncopy_16_power.c b/kernel/power/sgemm_ncopy_16_power.c new file mode 100755 index 000000000..babe1376e --- /dev/null +++ b/kernel/power/sgemm_ncopy_16_power.c @@ -0,0 +1,482 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + BLASLONG i, j; + + IFLOAT *aoffset; + IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; + IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; + IFLOAT *aoffset9, *aoffset10, *aoffset11, *aoffset12; + IFLOAT *aoffset13, *aoffset14, *aoffset15, *aoffset16; + + IFLOAT *boffset; + IFLOAT ctemp01, ctemp02, ctemp03, ctemp04; + IFLOAT ctemp05, ctemp06, ctemp07, ctemp08; + IFLOAT ctemp09, ctemp10, ctemp11, ctemp12; + IFLOAT ctemp13, ctemp14, ctemp15, ctemp16; + IFLOAT ctemp17, ctemp19 ; + IFLOAT ctemp21, ctemp23 ; + IFLOAT ctemp25, ctemp27 ; + IFLOAT ctemp29, ctemp31 ; + + aoffset = a; + boffset = b; + j = (n >> 4); + if (j > 0){ + do{ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset5 = aoffset4 + lda; + aoffset6 = aoffset5 + lda; + aoffset7 = aoffset6 + lda; + aoffset8 = aoffset7 + lda; + aoffset9 = aoffset8 + lda; + aoffset10 = aoffset9 + lda; + aoffset11 = aoffset10 + lda; + aoffset12 = aoffset11 + lda; + aoffset13 = aoffset12 + lda; + aoffset14 = aoffset13 + lda; + aoffset15 = aoffset14 + lda; + aoffset16 = aoffset15 + lda; + aoffset += 16 * lda; + i = (m >> 2); + if (i > 0){ + vector float c1, c2, c3, c4, c5, c6, c7, c8; + vector float c9, c10, c11, c12, c13, c14, c15, c16; + vector float t1, t2, t3, t4, t5, t6, t7, t8; + vector float t9, t10, t11, t12; + do{ + c1 = vec_xl(0, aoffset1); + c2 = vec_xl(0, aoffset2); + c3 = vec_xl(0, aoffset3); + c4 = vec_xl(0, aoffset4); + c5 = vec_xl(0, aoffset5); + c6 = vec_xl(0, aoffset6); + c7 = vec_xl(0, aoffset7); + c8 = vec_xl(0, aoffset8); + c9 = vec_xl(0, aoffset9); + c10 = vec_xl(0, aoffset10); + c11 = vec_xl(0, aoffset11); + c12 = vec_xl(0, aoffset12); + c13 = vec_xl(0, aoffset13); + c14 = vec_xl(0, aoffset14); + c15 = vec_xl(0, aoffset15); + c16 = vec_xl(0, aoffset16); + + t1 = vec_mergeh(c1, c2); + t2 = vec_mergeh(c3, c4); + t3 = vec_mergeh(c5, c6); + t4 = vec_mergeh(c7, c8); + t9 = vec_mergeh(c9, c10); + t10 = vec_mergeh(c11, c12); + t11 = vec_mergeh(c13, c14); + t12 = vec_mergeh(c15, c16); + + t5 = vec_xxpermdi(t1, t2, 0b00); + t6 = vec_xxpermdi(t3, t4, 0b00); + t7 = vec_xxpermdi(t9, t10, 0b00); + t8 = vec_xxpermdi(t11, t12, 0b00); + + vec_xst(t5, 0, boffset); + vec_xst(t6, 0, boffset+4); + vec_xst(t7, 0, boffset+8); + vec_xst(t8, 0, boffset+12); + t5 = vec_xxpermdi(t1, t2, 0b11); + t6 = vec_xxpermdi(t3, t4, 0b11); + t7 = vec_xxpermdi(t9, t10, 0b11); + t8 = vec_xxpermdi(t11, t12, 0b11); + vec_xst(t5, 0, boffset+16); + vec_xst(t6, 0, boffset+20); + vec_xst(t7, 0, boffset+24); + vec_xst(t8, 0, boffset+28); + + t1 = vec_mergel(c1, c2); + t2 = vec_mergel(c3, c4); + t3 = vec_mergel(c5, c6); + t4 = vec_mergel(c7, c8); + t9 = vec_mergel(c9, c10); + t10 = vec_mergel(c11, c12); + t11 = vec_mergel(c13, c14); + t12 = vec_mergel(c15, c16); + t5 = vec_xxpermdi(t1, t2, 0b00); + t6 = vec_xxpermdi(t3, t4, 0b00); + t7 = vec_xxpermdi(t9, t10, 0b00); + t8 = vec_xxpermdi(t11, t12, 0b00); + vec_xst(t5, 0, boffset+32); + vec_xst(t6, 0, boffset+36); + vec_xst(t7, 0, boffset+40); + vec_xst(t8, 0, boffset+44); + + t5 = vec_xxpermdi(t1, t2, 0b11); + t6 = vec_xxpermdi(t3, t4, 0b11); + t7 = vec_xxpermdi(t9, t10, 0b11); + t8 = vec_xxpermdi(t11, t12, 0b11); + vec_xst(t5, 0, boffset+48); + vec_xst(t6, 0, boffset+52); + vec_xst(t7, 0, boffset+56); + vec_xst(t8, 0, boffset+60); + + aoffset1 += 4; + aoffset2 += 4; + 
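+              /* the 4x16 block loaded above has been interleaved into B
+                 (16 consecutive elements per packed row) by the
+                 vec_mergeh/vec_mergel and vec_xxpermdi shuffles;
+                 step the remaining column pointers past those 4 rows */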
aoffset3 += 4; + aoffset4 += 4; + aoffset5 += 4; + aoffset6 += 4; + aoffset7 += 4; + aoffset8 += 4; + + aoffset9 += 4; + aoffset10 += 4; + aoffset11 += 4; + aoffset12 += 4; + aoffset13 += 4; + aoffset14 += 4; + aoffset15 += 4; + aoffset16 += 4; + boffset += 64; + + i --; + }while(i > 0); + } + i = (m & 3); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp03 = *(aoffset2 + 0); + ctemp05 = *(aoffset3 + 0); + ctemp07 = *(aoffset4 + 0); + ctemp09 = *(aoffset5 + 0); + ctemp11 = *(aoffset6 + 0); + ctemp13 = *(aoffset7 + 0); + ctemp15 = *(aoffset8 + 0); + + ctemp17 = *(aoffset9 + 0); + ctemp19 = *(aoffset10 + 0); + ctemp21 = *(aoffset11 + 0); + ctemp23 = *(aoffset12 + 0); + ctemp25 = *(aoffset13 + 0); + ctemp27 = *(aoffset14 + 0); + ctemp29 = *(aoffset15 + 0); + ctemp31 = *(aoffset16 + 0); + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + *(boffset + 2) = ctemp05; + *(boffset + 3) = ctemp07; + *(boffset + 4) = ctemp09; + *(boffset + 5) = ctemp11; + *(boffset + 6) = ctemp13; + *(boffset + 7) = ctemp15; + + *(boffset + 8) = ctemp17; + *(boffset + 9) = ctemp19; + *(boffset + 10) = ctemp21; + *(boffset + 11) = ctemp23; + *(boffset + 12) = ctemp25; + *(boffset + 13) = ctemp27; + *(boffset + 14) = ctemp29; + *(boffset + 15) = ctemp31; + aoffset1+=1; + aoffset2+=1; + aoffset3+=1; + aoffset4+=1; + aoffset5+=1; + aoffset6+=1; + aoffset7+=1; + aoffset8+=1; + aoffset9+=1; + aoffset10+=1; + aoffset11+=1; + aoffset12+=1; + aoffset13+=1; + aoffset14+=1; + aoffset15+=1; + aoffset16+=1; + boffset += 16; + i --; + }while(i > 0); + } + j--; + }while(j > 0); + } /* end of if(j > 0) */ + + if (n & 8){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset5 = aoffset4 + lda; + aoffset6 = aoffset5 + lda; + aoffset7 = aoffset6 + lda; + aoffset8 = aoffset7 + lda; + aoffset += 8 * lda; + + i = (m >> 2); + if (i > 0){ + vector float c1, c2, c3, c4, c5, c6, c7, c8; + vector float t1, t2, t3, t4, t5, t6, t7, t8; + do{ + c1 = vec_xl(0, aoffset1); + c2 = vec_xl(0, aoffset2); + c3 = vec_xl(0, aoffset3); + c4 = vec_xl(0, aoffset4); + c5 = vec_xl(0, aoffset5); + c6 = vec_xl(0, aoffset6); + c7 = vec_xl(0, aoffset7); + c8 = vec_xl(0, aoffset8); + + t1 = vec_mergeh(c1, c2); + t2 = vec_mergeh(c3, c4); + t3 = vec_mergeh(c5, c6); + t4 = vec_mergeh(c7, c8); + + t5 = vec_xxpermdi(t1, t2, 0b00); + t6 = vec_xxpermdi(t3, t4, 0b00); + t7 = vec_xxpermdi(t1, t2, 0b11); + t8 = vec_xxpermdi(t3, t4, 0b11); + + vec_xst(t5, 0, boffset); + vec_xst(t6, 0, boffset+4); + vec_xst(t7, 0, boffset+8); + vec_xst(t8, 0, boffset+12); + + t1 = vec_mergel(c1, c2); + t2 = vec_mergel(c3, c4); + t3 = vec_mergel(c5, c6); + t4 = vec_mergel(c7, c8); + + t5 = vec_xxpermdi(t1, t2, 0b00); + t6 = vec_xxpermdi(t3, t4, 0b00); + t7 = vec_xxpermdi(t1, t2, 0b11); + t8 = vec_xxpermdi(t3, t4, 0b11); + + vec_xst(t5, 0, boffset+16); + vec_xst(t6, 0, boffset+20); + vec_xst(t7, 0, boffset+24); + vec_xst(t8, 0, boffset+28); + + aoffset1 += 4; + aoffset2 += 4; + aoffset3 += 4; + aoffset4 += 4; + aoffset5 += 4; + aoffset6 += 4; + aoffset7 += 4; + aoffset8 += 4; + + boffset += 32; + i--; + }while(i > 0); + } + + i = (m & 3); + if (i > 0) { + do { + ctemp01 = *(aoffset1 + 0); + ctemp03 = *(aoffset2 + 0); + ctemp05 = *(aoffset3 + 0); + ctemp07 = *(aoffset4 + 0); + ctemp09 = *(aoffset5 + 0); + ctemp11 = *(aoffset6 + 0); + ctemp13 = *(aoffset7 + 0); + ctemp15 = *(aoffset8 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + *(boffset + 2) = ctemp05; + *(boffset + 3) = ctemp07; + *(boffset + 4) = 
ctemp09; + *(boffset + 5) = ctemp11; + *(boffset + 6) = ctemp13; + *(boffset + 7) = ctemp15; + + aoffset1+=1; + aoffset2+=1; + aoffset3+=1; + aoffset4+=1; + aoffset5+=1; + aoffset6+=1; + aoffset7+=1; + aoffset8+=1; + + boffset += 8; + i--; + } while (i > 0); + } + } + + if (n & 4){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset += 4 * lda; + + i = (m >> 2); + if (i > 0){ + vector float c1, c2, c3, c4; + vector float t1, t2, t3, t4; + do{ + c1 = vec_xl(0, aoffset1); + c2 = vec_xl(0, aoffset2); + c3 = vec_xl(0, aoffset3); + c4 = vec_xl(0, aoffset4); + + t1 = vec_mergeh(c1, c2); + t2 = vec_mergeh(c3, c4); + + t3 = vec_xxpermdi(t1, t2, 0b00); + t4 = vec_xxpermdi(t1, t2, 0b11); + + vec_xst(t3, 0, boffset); + vec_xst(t4, 0, boffset+4); + + t1 = vec_mergel(c1, c2); + t2 = vec_mergel(c3, c4); + + t3 = vec_xxpermdi(t1, t2, 0b00); + t4 = vec_xxpermdi(t1, t2, 0b11); + + vec_xst(t3, 0, boffset+8); + vec_xst(t4, 0, boffset+12); + + aoffset1 += 4; + aoffset2 += 4; + aoffset3 += 4; + aoffset4 += 4; + + boffset += 16; + i--; + }while(i > 0); + } + + i = (m & 3); + if (i > 0) { + do { + ctemp01 = *(aoffset1 + 0); + ctemp03 = *(aoffset2 + 0); + ctemp05 = *(aoffset3 + 0); + ctemp07 = *(aoffset4 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + *(boffset + 2) = ctemp05; + *(boffset + 3) = ctemp07; + + aoffset1+=1; + aoffset2+=1; + aoffset3+=1; + aoffset4+=1; + + boffset += 4; + i--; + } while (i > 0); + } + } + + if (n & 2){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset += 2 * lda; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + *(boffset + 2) = ctemp02; + *(boffset + 3) = ctemp04; + + aoffset1 += 2; + aoffset2 += 2; + boffset += 4; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp03 = *(aoffset2 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + boffset += 2; + } + } + + if (n & 1){ + aoffset1 = aoffset; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + + aoffset1 += 2; + boffset += 2; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + + *(boffset + 0) = ctemp01; + // boffset += 1; + } + } + + return 0; +} diff --git a/kernel/power/sgemm_tcopy_16_power8.S b/kernel/power/sgemm_tcopy_16_power8.S index b9f6d63fb..6d2c6a555 100644 --- a/kernel/power/sgemm_tcopy_16_power8.S +++ b/kernel/power/sgemm_tcopy_16_power8.S @@ -108,6 +108,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define o0 0 +#ifdef POWER10 +#include "sgemm_tcopy_macros_16_power10.S" +#endif #include "sgemm_tcopy_macros_16_power8.S" #define STACKSIZE 144 diff --git a/kernel/power/sgemm_tcopy_macros_16_power10.S b/kernel/power/sgemm_tcopy_macros_16_power10.S new file mode 100644 index 000000000..dca37e48a --- /dev/null +++ b/kernel/power/sgemm_tcopy_macros_16_power10.S @@ -0,0 +1,323 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/21 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + +/********************************************************************************************** +* Macros for N=4 and M=16 +**********************************************************************************************/ + +#if defined(_AIX) +define(`COPY_4x16', ` +#else +.macro COPY_4x16 +#endif + + lxvpx vs32, o0, A0 + lxvpx vs34, o32, A0 + + lxvpx vs36, o0, A1 + lxvpx vs38, o32, A1 + + lxvpx vs40, o0, A2 + lxvpx vs42, o32, A2 + + lxvpx vs44, o0, A3 + lxvpx vs46, o32, A3 + + mr T1, BO + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvx vs32, o0, T1 + stxvx vs33, o16, T1 + stxvx vs34, o32, T1 + stxvx vs35, o48, T1 +#else + stxvx vs33, o0, T1 + stxvx vs32, o16, T1 + stxvx vs35, o32, T1 + stxvx vs34, o48, T1 +#endif + + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvx vs36, o0, T1 + stxvx vs37, o16, T1 + stxvx vs38, o32, T1 + stxvx vs39, o48, T1 +#else + stxvx vs37, o0, T1 + stxvx vs36, o16, T1 + stxvx vs39, o32, T1 + stxvx vs38, o48, T1 +#endif + + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvx vs40, o0, T1 + stxvx vs41, o16, T1 + stxvx vs42, o32, T1 + stxvx vs43, o48, T1 +#else + stxvx vs41, o0, T1 + stxvx vs40, o16, T1 + stxvx vs43, o32, T1 + stxvx vs42, o48, T1 +#endif + + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvx vs44, o0, T1 + stxvx vs45, o16, T1 + stxvx vs46, o32, T1 + stxvx vs47, o48, T1 +#else + stxvx vs45, o0, T1 + stxvx vs44, o16, T1 + stxvx vs47, o32, T1 + stxvx vs46, o48, T1 +#endif + +#if defined(_AIX) +') +#else +.endm +#endif + +/********************************************************************************************** +* Macros for N=4 and M=8 +**********************************************************************************************/ + +#if defined(_AIX) +define(`COPY_4x8', ` +#else +.macro COPY_4x8 +#endif + + lxvpx vs32, o0, A0 + + lxvpx 
vs34, o0, A1 + + lxvpx vs36, o0, A2 + + lxvpx vs38, o0, A3 + + mr T1, BO + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvx vs32, o0, T1 + stxvx vs33, o16, T1 + + stxvx vs34, o32, T1 + stxvx vs35, o48, T1 +#else + stxvx vs33, o0, T1 + stxvx vs32, o16, T1 + + stxvx vs35, o32, T1 + stxvx vs34, o48, T1 +#endif + + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvx vs36, o0, T1 + stxvx vs37, o16, T1 + + stxvx vs38, o32, T1 + stxvx vs39, o48, T1 +#else + stxvx vs37, o0, T1 + stxvx vs36, o16, T1 + + stxvx vs39, o32, T1 + stxvx vs38, o48, T1 +#endif + +#if defined(_AIX) +') +#else +.endm +#endif + +/********************************************************************************************** +* Macros for N=2 and M=16 +**********************************************************************************************/ + +#if defined(_AIX) +define(`COPY_2x16', ` +#else +.macro COPY_2x16 +#endif + + lxvpx vs32, o0, A0 + lxvpx vs34, o32, A0 + + lxvpx vs36, o0, A1 + lxvpx vs38, o32, A1 + + mr T1, BO + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvx vs32, o0, T1 + stxvx vs33, o16, T1 + stxvx vs34, o32, T1 + stxvx vs35, o48, T1 +#else + stxvx vs33, o0, T1 + stxvx vs32, o16, T1 + stxvx vs35, o32, T1 + stxvx vs34, o48, T1 +#endif + + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvx vs36, o0, T1 + stxvx vs37, o16, T1 + stxvx vs38, o32, T1 + stxvx vs39, o48, T1 +#else + stxvx vs37, o0, T1 + stxvx vs36, o16, T1 + stxvx vs39, o32, T1 + stxvx vs38, o48, T1 +#endif + +#if defined(_AIX) +') +#else +.endm +#endif + +/********************************************************************************************** +* Macros for N=2 and M=8 +**********************************************************************************************/ + +#if defined(_AIX) +define(`COPY_2x8', ` +#else +.macro COPY_2x8 +#endif + + lxvpx vs32, o0, A0 + + lxvpx vs34, o0, A1 + + mr T1, BO + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvx vs32, o0, T1 + stxvx vs33, o16, T1 + + stxvx vs34, o32, T1 + stxvx vs35, o48, T1 +#else + stxvx vs33, o0, T1 + stxvx vs32, o16, T1 + + stxvx vs35, o32, T1 + stxvx vs34, o48, T1 +#endif + +#if defined(_AIX) +') +#else +.endm +#endif + +/********************************************************************************************** +* Macros for N=1 and M=16 +**********************************************************************************************/ + +#if defined(_AIX) +define(`COPY_1x16', ` +#else +.macro COPY_1x16 +#endif + + lxvpx vs32, o0, A0 + lxvpx vs34, o32, A0 + + mr T1, BO + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvx vs32, o0, T1 + stxvx vs33, o16, T1 + stxvx vs34, o32, T1 + stxvx vs35, o48, T1 +#else + stxvx vs33, o0, T1 + stxvx vs32, o16, T1 + stxvx vs35, o32, T1 + stxvx vs34, o48, T1 +#endif + +#if defined(_AIX) +') +#else +.endm +#endif + +/********************************************************************************************** +* Macros for N=1 and M=8 +**********************************************************************************************/ + +#if defined(_AIX) +define(`COPY_1x8', ` +#else +.macro COPY_1x8 +#endif + + lxvpx vs32, o0, A0 + + mr T1, BO + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvx vs32, o0, T1 + stxvx vs33, o16, T1 +#else + stxvx vs33, o0, T1 + stxvx vs32, o16, T1 +#endif + +#if defined(_AIX) +') +#else +.endm +#endif + diff --git a/kernel/power/sgemm_tcopy_macros_16_power8.S b/kernel/power/sgemm_tcopy_macros_16_power8.S index ed592a604..af237d5ee 100644 --- 
a/kernel/power/sgemm_tcopy_macros_16_power8.S +++ b/kernel/power/sgemm_tcopy_macros_16_power8.S @@ -38,6 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * Macros for N=4 and M=16 **********************************************************************************************/ +#ifndef POWER10 #if defined(_AIX) define(`COPY_4x16', ` #else @@ -141,6 +142,7 @@ define(`COPY_4x8', ` #else .endm #endif +#endif /********************************************************************************************** * Macros for N=4 and M=4 @@ -264,6 +266,7 @@ define(`COPY_4x1', ` * Macros for N=2 and M=16 **********************************************************************************************/ +#ifndef POWER10 #if defined(_AIX) define(`COPY_2x16', ` #else @@ -329,6 +332,7 @@ define(`COPY_2x8', ` #else .endm #endif +#endif /********************************************************************************************** * Macros for N=2 and M=4 @@ -418,6 +422,7 @@ define(`COPY_2x1', ` * Macros for N=1 and M=16 **********************************************************************************************/ +#ifndef POWER10 #if defined(_AIX) define(`COPY_1x16', ` #else @@ -465,6 +470,7 @@ define(`COPY_1x8', ` #else .endm #endif +#endif /********************************************************************************************** * Macros for N=1 and M=4 diff --git a/kernel/riscv64/KERNEL.C910V b/kernel/riscv64/KERNEL.C910V index e6f2b3314..2798a870e 100644 --- a/kernel/riscv64/KERNEL.C910V +++ b/kernel/riscv64/KERNEL.C910V @@ -42,8 +42,8 @@ ZSUMKERNEL = ../arm/zsum.c SAXPYKERNEL = axpy_vector.c DAXPYKERNEL = axpy_vector.c -CAXPYKERNEL = zaxpy.c -ZAXPYKERNEL = zaxpy.c +CAXPYKERNEL = zaxpy_vector.c +ZAXPYKERNEL = zaxpy_vector.c SAXPBYKERNEL = axpby_vector.c DAXPBYKERNEL = axpby_vector.c @@ -59,7 +59,7 @@ SDOTKERNEL = dot_vector.c DDOTKERNEL = dot_vector.c CDOTKERNEL = zdot_vector.c ZDOTKERNEL = zdot_vector.c -DSDOTKERNEL = ../generic/dot.c +DSDOTKERNEL = dsdot_vector.c SNRM2KERNEL = nrm2_vector.c DNRM2KERNEL = nrm2_vector.c diff --git a/kernel/riscv64/KERNEL.RISCV64_GENERIC b/kernel/riscv64/KERNEL.RISCV64_GENERIC index 61a8a2b91..15bcd2289 100644 --- a/kernel/riscv64/KERNEL.RISCV64_GENERIC +++ b/kernel/riscv64/KERNEL.RISCV64_GENERIC @@ -45,6 +45,11 @@ DAXPYKERNEL = ../riscv64/axpy.c CAXPYKERNEL = ../riscv64/zaxpy.c ZAXPYKERNEL = ../riscv64/zaxpy.c +SAXPBYKERNEL = ../riscv64/axpby.c +DAXPBYKERNEL = ../riscv64/axpby.c +CAXPBYKERNEL = ../riscv64/zaxpby.c +ZAXPBYKERNEL = ../riscv64/zaxpby.c + SCOPYKERNEL = ../riscv64/copy.c DCOPYKERNEL = ../riscv64/copy.c CCOPYKERNEL = ../riscv64/zcopy.c diff --git a/kernel/riscv64/KERNEL.RISCV64_ZVL128B b/kernel/riscv64/KERNEL.RISCV64_ZVL128B new file mode 100644 index 000000000..fec69ee09 --- /dev/null +++ b/kernel/riscv64/KERNEL.RISCV64_ZVL128B @@ -0,0 +1,243 @@ +SAMAXKERNEL = amax_rvv.c +DAMAXKERNEL = amax_rvv.c +CAMAXKERNEL = zamax_rvv.c +ZAMAXKERNEL = zamax_rvv.c + +SAMINKERNEL = amin_rvv.c +DAMINKERNEL = amin_rvv.c +CAMINKERNEL = zamin_rvv.c +ZAMINKERNEL = zamin_rvv.c + +SMAXKERNEL = max_rvv.c +DMAXKERNEL = max_rvv.c + +SMINKERNEL = min_rvv.c +DMINKERNEL = min_rvv.c + +ISAMAXKERNEL = iamax_rvv.c +IDAMAXKERNEL = iamax_rvv.c +ICAMAXKERNEL = izamax_rvv.c +IZAMAXKERNEL = izamax_rvv.c + +ISAMINKERNEL = iamin_rvv.c +IDAMINKERNEL = iamin_rvv.c +ICAMINKERNEL = izamin_rvv.c +IZAMINKERNEL = izamin_rvv.c + +ISMAXKERNEL = imax_rvv.c +IDMAXKERNEL = imax_rvv.c + +ISMINKERNEL = imin_rvv.c +IDMINKERNEL = imin_rvv.c + +SASUMKERNEL = asum_rvv.c +DASUMKERNEL 
= asum_rvv.c +CASUMKERNEL = zasum_rvv.c +ZASUMKERNEL = zasum_rvv.c + +SSUMKERNEL = sum_rvv.c +DSUMKERNEL = sum_rvv.c +CSUMKERNEL = zsum_rvv.c +ZSUMKERNEL = zsum_rvv.c + +SAXPYKERNEL = axpy_rvv.c +DAXPYKERNEL = axpy_rvv.c +CAXPYKERNEL = zaxpy_rvv.c +ZAXPYKERNEL = zaxpy_rvv.c + +SAXPBYKERNEL = axpby_rvv.c +DAXPBYKERNEL = axpby_rvv.c +CAXPBYKERNEL = zaxpby_rvv.c +ZAXPBYKERNEL = zaxpby_rvv.c + +SCOPYKERNEL = copy_rvv.c +DCOPYKERNEL = copy_rvv.c +CCOPYKERNEL = zcopy_rvv.c +ZCOPYKERNEL = zcopy_rvv.c + +SDOTKERNEL = dot_rvv.c +DDOTKERNEL = dot_rvv.c +CDOTKERNEL = zdot_rvv.c +ZDOTKERNEL = zdot_rvv.c +DSDOTKERNEL = dot_rvv.c + +SNRM2KERNEL = nrm2_rvv.c +DNRM2KERNEL = nrm2_rvv.c +CNRM2KERNEL = znrm2_rvv.c +ZNRM2KERNEL = znrm2_rvv.c + +SROTKERNEL = rot_rvv.c +DROTKERNEL = rot_rvv.c +CROTKERNEL = zrot_rvv.c +ZROTKERNEL = zrot_rvv.c + +SSCALKERNEL = scal_rvv.c +DSCALKERNEL = scal_rvv.c +CSCALKERNEL = zscal_rvv.c +ZSCALKERNEL = zscal_rvv.c + +SSWAPKERNEL = swap_rvv.c +DSWAPKERNEL = swap_rvv.c +CSWAPKERNEL = zswap_rvv.c +ZSWAPKERNEL = zswap_rvv.c + +SGEMVNKERNEL = gemv_n_rvv.c +DGEMVNKERNEL = gemv_n_rvv.c +CGEMVNKERNEL = zgemv_n_rvv.c +ZGEMVNKERNEL = zgemv_n_rvv.c + +SGEMVTKERNEL = gemv_t_rvv.c +DGEMVTKERNEL = gemv_t_rvv.c +CGEMVTKERNEL = zgemv_t_rvv.c +ZGEMVTKERNEL = zgemv_t_rvv.c + +SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl128b.c +SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl128b.c +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) +DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl128b.c +CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) +CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl128b.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) +ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + 
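+# Triangular-matrix kernels: TRMM uses the zvl128b micro-kernels together with
+# the generic packing routines, while TRSM falls back to the generic C kernels.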
+STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl128b.c +STRMMUNCOPY_M = ../generic/trmm_uncopy_$(SGEMM_UNROLL_M).c +STRMMLNCOPY_M = ../generic/trmm_lncopy_$(SGEMM_UNROLL_M).c +STRMMUTCOPY_M = ../generic/trmm_utcopy_$(SGEMM_UNROLL_M).c +STRMMLTCOPY_M = ../generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c + +DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl128b.c +DTRMMUNCOPY_M = ../generic/trmm_uncopy_$(DGEMM_UNROLL_M).c +DTRMMLNCOPY_M = ../generic/trmm_lncopy_$(DGEMM_UNROLL_M).c +DTRMMUTCOPY_M = ../generic/trmm_utcopy_$(DGEMM_UNROLL_M).c +DTRMMLTCOPY_M = ../generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c + +CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl128b.c +CTRMMUNCOPY_M = ../generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c +CTRMMLNCOPY_M = ../generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c +CTRMMUTCOPY_M = ../generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c +CTRMMLTCOPY_M = ../generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c + +ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl128b.c +ZTRMMUNCOPY_M = ../generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c +ZTRMMLNCOPY_M = ../generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c +ZTRMMUTCOPY_M = ../generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c +ZTRMMLTCOPY_M = ../generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +SSYMV_U_KERNEL = symv_U_rvv.c +SSYMV_L_KERNEL = symv_L_rvv.c +DSYMV_U_KERNEL = symv_U_rvv.c +DSYMV_L_KERNEL = symv_L_rvv.c +CSYMV_U_KERNEL = zsymv_U_rvv.c +CSYMV_L_KERNEL = zsymv_L_rvv.c +ZSYMV_U_KERNEL = zsymv_U_rvv.c +ZSYMV_L_KERNEL = zsymv_L_rvv.c + +CHEMV_L_KERNEL = zhemv_LM_rvv.c +CHEMV_M_KERNEL = zhemv_LM_rvv.c +CHEMV_U_KERNEL = zhemv_UV_rvv.c +CHEMV_V_KERNEL = zhemv_UV_rvv.c +ZHEMV_L_KERNEL = zhemv_LM_rvv.c +ZHEMV_M_KERNEL = zhemv_LM_rvv.c +ZHEMV_U_KERNEL = zhemv_UV_rvv.c +ZHEMV_V_KERNEL = zhemv_UV_rvv.c + +SSYMMUCOPY_M = ../generic/symm_ucopy_$(SGEMM_UNROLL_M).c +SSYMMLCOPY_M = ../generic/symm_lcopy_$(SGEMM_UNROLL_M).c + +DSYMMUCOPY_M = ../generic/symm_ucopy_$(DGEMM_UNROLL_M).c +DSYMMLCOPY_M = ../generic/symm_lcopy_$(DGEMM_UNROLL_M).c + +CSYMMUCOPY_M = ../generic/zsymm_ucopy_$(CGEMM_UNROLL_M).c +CSYMMLCOPY_M = ../generic/zsymm_lcopy_$(CGEMM_UNROLL_M).c + +ZSYMMUCOPY_M = ../generic/zsymm_ucopy_$(ZGEMM_UNROLL_M).c +ZSYMMLCOPY_M = ../generic/zsymm_lcopy_$(ZGEMM_UNROLL_M).c + +CHEMMLTCOPY_M = ../generic/zhemm_ltcopy_$(CGEMM_UNROLL_M).c +CHEMMUTCOPY_M = ../generic/zhemm_utcopy_$(CGEMM_UNROLL_M).c + +ZHEMMLTCOPY_M = ../generic/zhemm_ltcopy_$(ZGEMM_UNROLL_M).c +ZHEMMUTCOPY_M = ../generic/zhemm_utcopy_$(ZGEMM_UNROLL_M).c + +LSAME_KERNEL = ../generic/lsame.c + +SCABS_KERNEL = ../generic/cabs.c +DCABS_KERNEL = ../generic/cabs.c +QCABS_KERNEL = ../generic/cabs.c + +ifndef SGEMM_BETA +SGEMM_BETA = gemm_beta_rvv.c +endif +ifndef DGEMM_BETA +DGEMM_BETA = gemm_beta_rvv.c +endif +ifndef CGEMM_BETA +CGEMM_BETA = 
zgemm_beta_rvv.c +endif +ifndef ZGEMM_BETA +ZGEMM_BETA = zgemm_beta_rvv.c +endif diff --git a/kernel/riscv64/KERNEL.RISCV64_ZVL256B b/kernel/riscv64/KERNEL.RISCV64_ZVL256B new file mode 100644 index 000000000..d8690682f --- /dev/null +++ b/kernel/riscv64/KERNEL.RISCV64_ZVL256B @@ -0,0 +1,199 @@ +SAMAXKERNEL = amax_vector.c +DAMAXKERNEL = amax_vector.c +CAMAXKERNEL = zamax_vector.c +ZAMAXKERNEL = zamax_vector.c + +SAMINKERNEL = amin_vector.c +DAMINKERNEL = amin_vector.c +CAMINKERNEL = zamin_vector.c +ZAMINKERNEL = zamin_vector.c + +SMAXKERNEL = max_vector.c +DMAXKERNEL = max_vector.c + +SMINKERNEL = min_vector.c +DMINKERNEL = min_vector.c + +ISAMAXKERNEL = iamax_vector.c +IDAMAXKERNEL = iamax_vector.c +ICAMAXKERNEL = izamax_vector.c +IZAMAXKERNEL = izamax_vector.c + +ISAMINKERNEL = iamin_vector.c +IDAMINKERNEL = iamin_vector.c +ICAMINKERNEL = izamin_vector.c +IZAMINKERNEL = izamin_vector.c + +ISMAXKERNEL = imax_vector.c +IDMAXKERNEL = imax_vector.c + +ISMINKERNEL = imin_vector.c +IDMINKERNEL = imin_vector.c + +SASUMKERNEL = asum_vector.c +DASUMKERNEL = asum_vector.c +CASUMKERNEL = zasum_vector.c +ZASUMKERNEL = zasum_vector.c + +SSUMKERNEL = sum_vector.c +DSUMKERNEL = sum_vector.c +CSUMKERNEL = zsum_vector.c +ZSUMKERNEL = zsum_vector.c + +SAXPYKERNEL = axpy_vector.c +DAXPYKERNEL = axpy_vector.c +CAXPYKERNEL = zaxpy_vector.c +ZAXPYKERNEL = zaxpy_vector.c + +SCOPYKERNEL = copy_vector.c +DCOPYKERNEL = copy_vector.c +CCOPYKERNEL = zcopy_vector.c +ZCOPYKERNEL = zcopy_vector.c + +SDOTKERNEL = dot_vector.c +DDOTKERNEL = dot_vector.c +CDOTKERNEL = zdot_vector.c +ZDOTKERNEL = zdot_vector.c +DSDOTKERNEL = ../generic/dot.c + +SNRM2KERNEL = nrm2_vector.c +DNRM2KERNEL = nrm2_vector.c +CNRM2KERNEL = znrm2_vector.c +ZNRM2KERNEL = znrm2_vector.c + +SROTKERNEL = rot_vector.c +DROTKERNEL = rot_vector.c +CROTKERNEL = zrot_vector.c +ZROTKERNEL = zrot_vector.c + +SSCALKERNEL = scal_vector.c +DSCALKERNEL = scal_vector.c +CSCALKERNEL = zscal_vector.c +ZSCALKERNEL = zscal_vector.c + +SSWAPKERNEL = swap_vector.c +DSWAPKERNEL = swap_vector.c +CSWAPKERNEL = zswap_vector.c +ZSWAPKERNEL = zswap_vector.c + +SGEMVNKERNEL = gemv_n_vector.c +DGEMVNKERNEL = gemv_n_vector.c +CGEMVNKERNEL = zgemv_n_vector.c +ZGEMVNKERNEL = zgemv_n_vector.c + +SGEMVTKERNEL = gemv_t_vector.c +DGEMVTKERNEL = gemv_t_vector.c +CGEMVTKERNEL = zgemv_t_vector.c +ZGEMVTKERNEL = zgemv_t_vector.c + +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl256b.c +DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl256b.c +CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl256b.c +ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl256b.c + +SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl256b.c +SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl256b.c +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +ifneq ($(DGEMM_UNROLL_M), 
$(DGEMM_UNROLL_N)) +DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl256b.c +CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) +CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl256b.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) +ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +SSYMV_U_KERNEL = symv_U_vector.c +SSYMV_L_KERNEL = symv_L_vector.c +DSYMV_U_KERNEL = symv_U_vector.c +DSYMV_L_KERNEL = symv_L_vector.c +CSYMV_U_KERNEL = ../generic/zsymv_k.c +CSYMV_L_KERNEL = ../generic/zsymv_k.c +ZSYMV_U_KERNEL = ../generic/zsymv_k.c +ZSYMV_L_KERNEL = ../generic/zsymv_k.c + +CHEMV_L_KERNEL = zhemv_LM_vector.c +CHEMV_M_KERNEL = zhemv_LM_vector.c +CHEMV_U_KERNEL = zhemv_UV_vector.c +CHEMV_V_KERNEL = zhemv_UV_vector.c +ZHEMV_L_KERNEL = zhemv_LM_vector.c +ZHEMV_M_KERNEL = zhemv_LM_vector.c +ZHEMV_U_KERNEL = zhemv_UV_vector.c +ZHEMV_V_KERNEL = zhemv_UV_vector.c + +LSAME_KERNEL = ../generic/lsame.c + +SCABS_KERNEL = ../generic/cabs.c +DCABS_KERNEL = ../generic/cabs.c +QCABS_KERNEL = ../generic/cabs.c + +ifndef SGEMM_BETA +SGEMM_BETA = ../generic/gemm_beta.c +endif +ifndef DGEMM_BETA +DGEMM_BETA = ../generic/gemm_beta.c +endif +ifndef CGEMM_BETA +CGEMM_BETA = ../generic/zgemm_beta.c +endif +ifndef ZGEMM_BETA +ZGEMM_BETA = ../generic/zgemm_beta.c +endif diff --git a/kernel/riscv64/KERNEL.x280 b/kernel/riscv64/KERNEL.x280 new file mode 100644 index 000000000..86708fe01 --- /dev/null +++ b/kernel/riscv64/KERNEL.x280 @@ -0,0 +1,281 @@ +# ********************************************************************************** +# Copyright (c) 2022, The OpenBLAS Project +# All rights reserved. 
+# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# 3. Neither the name of the OpenBLAS project nor the names of +# its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +# USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ********************************************************************************** + +SAMAXKERNEL = amax_rvv.c +DAMAXKERNEL = amax_rvv.c +CAMAXKERNEL = zamax_rvv.c +ZAMAXKERNEL = zamax_rvv.c + +SAMINKERNEL = amin_rvv.c +DAMINKERNEL = amin_rvv.c +CAMINKERNEL = zamin_rvv.c +ZAMINKERNEL = zamin_rvv.c + +SMAXKERNEL = max_rvv.c +DMAXKERNEL = max_rvv.c + +SMINKERNEL = min_rvv.c +DMINKERNEL = min_rvv.c + +ISAMAXKERNEL = iamax_rvv.c +IDAMAXKERNEL = iamax_rvv.c +ICAMAXKERNEL = izamax_rvv.c +IZAMAXKERNEL = izamax_rvv.c + +ISAMINKERNEL = iamin_rvv.c +IDAMINKERNEL = iamin_rvv.c +ICAMINKERNEL = izamin_rvv.c +IZAMINKERNEL = izamin_rvv.c + +ISMAXKERNEL = imax_rvv.c +IDMAXKERNEL = imax_rvv.c + +ISMINKERNEL = imin_rvv.c +IDMINKERNEL = imin_rvv.c + +SASUMKERNEL = asum_rvv.c +DASUMKERNEL = asum_rvv.c +CASUMKERNEL = zasum_rvv.c +ZASUMKERNEL = zasum_rvv.c + +SSUMKERNEL = sum_rvv.c +DSUMKERNEL = sum_rvv.c +CSUMKERNEL = zsum_rvv.c +ZSUMKERNEL = zsum_rvv.c + +SAXPYKERNEL = axpy_rvv.c +DAXPYKERNEL = axpy_rvv.c +CAXPYKERNEL = zaxpy_rvv.c +ZAXPYKERNEL = zaxpy_rvv.c + +SAXPBYKERNEL = axpby_rvv.c +DAXPBYKERNEL = axpby_rvv.c +CAXPBYKERNEL = zaxpby_rvv.c +ZAXPBYKERNEL = zaxpby_rvv.c + +SCOPYKERNEL = copy_rvv.c +DCOPYKERNEL = copy_rvv.c +CCOPYKERNEL = zcopy_rvv.c +ZCOPYKERNEL = zcopy_rvv.c + +SDOTKERNEL = dot_rvv.c +DDOTKERNEL = dot_rvv.c +CDOTKERNEL = zdot_rvv.c +ZDOTKERNEL = zdot_rvv.c +DSDOTKERNEL = dot_rvv.c + +SNRM2KERNEL = nrm2_rvv.c +DNRM2KERNEL = nrm2_rvv.c +CNRM2KERNEL = znrm2_rvv.c +ZNRM2KERNEL = znrm2_rvv.c + +SROTKERNEL = rot_rvv.c +DROTKERNEL = rot_rvv.c +CROTKERNEL = zrot_rvv.c +ZROTKERNEL = zrot_rvv.c + +SSCALKERNEL = scal_rvv.c +DSCALKERNEL = scal_rvv.c +CSCALKERNEL = zscal_rvv.c +ZSCALKERNEL = zscal_rvv.c + +SSWAPKERNEL = swap_rvv.c +DSWAPKERNEL = swap_rvv.c +CSWAPKERNEL = zswap_rvv.c +ZSWAPKERNEL = zswap_rvv.c + +SGEMVNKERNEL = gemv_n_rvv.c +DGEMVNKERNEL = gemv_n_rvv.c +CGEMVNKERNEL = zgemv_n_rvv.c +ZGEMVNKERNEL = zgemv_n_rvv.c + +SGEMVTKERNEL = gemv_t_rvv.c +DGEMVTKERNEL = gemv_t_rvv.c +CGEMVTKERNEL = zgemv_t_rvv.c +ZGEMVTKERNEL = 
zgemv_t_rvv.c + +CTRMMKERNEL = ztrmmkernel_rvv_v1x4.c +ZTRMMKERNEL = ztrmmkernel_rvv_v1x4.c + +# SGEMM_UNROLL_N set in params.h +ifeq ($(SGEMM_UNROLL_N), 8) +# UNROLL_M is VLMAX +SGEMMKERNEL = gemmkernel_rvv_v1x8.c +SGEMMINCOPY = gemm_ncopy_rvv_v1.c +SGEMMITCOPY = gemm_tcopy_rvv_v1.c +SGEMMONCOPY = gemm_ncopy_$(SGEMM_UNROLL_N)_rvv.c +SGEMMOTCOPY = gemm_tcopy_$(SGEMM_UNROLL_N)_rvv.c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRMMKERNEL = trmmkernel_rvv_v1x8.c + +STRMMUNCOPY_M = trmm_uncopy_rvv_v1.c +STRMMLNCOPY_M = trmm_lncopy_rvv_v1.c +STRMMUTCOPY_M = trmm_utcopy_rvv_v1.c +STRMMLTCOPY_M = trmm_ltcopy_rvv_v1.c + +SSYMMUCOPY_M = symm_ucopy_rvv_v1.c +SSYMMLCOPY_M = symm_lcopy_rvv_v1.c +endif + +# SGEMM_UNROLL_N set in params.h +ifeq ($(DGEMM_UNROLL_N), 8) +# UNROLL_M is VLMAX +DGEMMKERNEL = gemmkernel_rvv_v1x8.c +DGEMMINCOPY = gemm_ncopy_rvv_v1.c +DGEMMITCOPY = gemm_tcopy_rvv_v1.c +DGEMMONCOPY = gemm_ncopy_$(DGEMM_UNROLL_N)_rvv.c +DGEMMOTCOPY = gemm_tcopy_$(DGEMM_UNROLL_N)_rvv.c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DTRMMKERNEL = trmmkernel_rvv_v1x8.c +DTRMMUNCOPY_M = trmm_uncopy_rvv_v1.c +DTRMMLNCOPY_M = trmm_lncopy_rvv_v1.c +DTRMMUTCOPY_M = trmm_utcopy_rvv_v1.c +DTRMMLTCOPY_M = trmm_ltcopy_rvv_v1.c + +DSYMMUCOPY_M = symm_ucopy_rvv_v1.c +DSYMMLCOPY_M = symm_lcopy_rvv_v1.c +endif + +CGEMMKERNEL = zgemmkernel_rvv_v1x4.c +CGEMMINCOPY = zgemm_ncopy_rvv_v1.c +CGEMMITCOPY = zgemm_tcopy_rvv_v1.c +CGEMMONCOPY = zgemm_ncopy_4_rvv.c +CGEMMOTCOPY = zgemm_tcopy_4_rvv.c + +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = zgemmkernel_rvv_v1x4.c + +ZGEMMINCOPY = zgemm_ncopy_rvv_v1.c +ZGEMMITCOPY = zgemm_tcopy_rvv_v1.c +ZGEMMONCOPY = zgemm_ncopy_4_rvv.c +ZGEMMOTCOPY = zgemm_tcopy_4_rvv.c + +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_rvv_v1.c +STRSMKERNEL_LT = trsm_kernel_LT_rvv_v1.c +STRSMKERNEL_RN = trsm_kernel_RN_rvv_v1.c +STRSMKERNEL_RT = trsm_kernel_RT_rvv_v1.c + +DTRSMKERNEL_LN = trsm_kernel_LN_rvv_v1.c +DTRSMKERNEL_LT = trsm_kernel_LT_rvv_v1.c +DTRSMKERNEL_RN = trsm_kernel_RN_rvv_v1.c +DTRSMKERNEL_RT = trsm_kernel_RT_rvv_v1.c + +CTRSMKERNEL_LN = trsm_kernel_LN_rvv_v1.c +CTRSMKERNEL_LT = trsm_kernel_LT_rvv_v1.c +CTRSMKERNEL_RN = trsm_kernel_RN_rvv_v1.c +CTRSMKERNEL_RT = trsm_kernel_RT_rvv_v1.c + +ZTRSMKERNEL_LN = trsm_kernel_LN_rvv_v1.c +ZTRSMKERNEL_LT = trsm_kernel_LT_rvv_v1.c +ZTRSMKERNEL_RN = trsm_kernel_RN_rvv_v1.c +ZTRSMKERNEL_RT = trsm_kernel_RT_rvv_v1.c + +TRSMCOPYLN_M = trsm_lncopy_rvv_v1.c +TRSMCOPYLT_M = trsm_ltcopy_rvv_v1.c +TRSMCOPYUN_M = trsm_uncopy_rvv_v1.c +TRSMCOPYUT_M = trsm_utcopy_rvv_v1.c + +ZTRSMCOPYLN_M = ztrsm_lncopy_rvv_v1.c +ZTRSMCOPYLT_M = ztrsm_ltcopy_rvv_v1.c +ZTRSMCOPYUN_M = ztrsm_uncopy_rvv_v1.c +ZTRSMCOPYUT_M = ztrsm_utcopy_rvv_v1.c + +SSYMV_U_KERNEL = symv_U_rvv.c +SSYMV_L_KERNEL = symv_L_rvv.c +DSYMV_U_KERNEL = symv_U_rvv.c +DSYMV_L_KERNEL = symv_L_rvv.c +CSYMV_U_KERNEL = 
zsymv_U_rvv.c +CSYMV_L_KERNEL = zsymv_L_rvv.c +ZSYMV_U_KERNEL = zsymv_U_rvv.c +ZSYMV_L_KERNEL = zsymv_L_rvv.c + +CHEMV_L_KERNEL = zhemv_LM_rvv.c +CHEMV_M_KERNEL = zhemv_LM_rvv.c +CHEMV_U_KERNEL = zhemv_UV_rvv.c +CHEMV_V_KERNEL = zhemv_UV_rvv.c +ZHEMV_L_KERNEL = zhemv_LM_rvv.c +ZHEMV_M_KERNEL = zhemv_LM_rvv.c +ZHEMV_U_KERNEL = zhemv_UV_rvv.c +ZHEMV_V_KERNEL = zhemv_UV_rvv.c + +ZHEMMLTCOPY_M = zhemm_ltcopy_rvv_v1.c +ZHEMMUTCOPY_M = zhemm_utcopy_rvv_v1.c + +CHEMMLTCOPY_M = zhemm_ltcopy_rvv_v1.c +CHEMMUTCOPY_M = zhemm_utcopy_rvv_v1.c + +ZSYMMUCOPY_M = zsymm_ucopy_rvv_v1.c +ZSYMMLCOPY_M = zsymm_lcopy_rvv_v1.c + +CSYMMUCOPY_M = zsymm_ucopy_rvv_v1.c +CSYMMLCOPY_M = zsymm_lcopy_rvv_v1.c + +ZTRMMUNCOPY_M = ztrmm_uncopy_rvv_v1.c +ZTRMMLNCOPY_M = ztrmm_lncopy_rvv_v1.c +ZTRMMUTCOPY_M = ztrmm_utcopy_rvv_v1.c +ZTRMMLTCOPY_M = ztrmm_ltcopy_rvv_v1.c + +CTRMMUNCOPY_M = ztrmm_uncopy_rvv_v1.c +CTRMMLNCOPY_M = ztrmm_lncopy_rvv_v1.c +CTRMMUTCOPY_M = ztrmm_utcopy_rvv_v1.c +CTRMMLTCOPY_M = ztrmm_ltcopy_rvv_v1.c + +LSAME_KERNEL = ../generic/lsame.c + +SCABS_KERNEL = ../generic/cabs.c +DCABS_KERNEL = ../generic/cabs.c +QCABS_KERNEL = ../generic/cabs.c + +ifndef SGEMM_BETA +SGEMM_BETA = gemm_beta_rvv.c +endif +ifndef DGEMM_BETA +DGEMM_BETA = gemm_beta_rvv.c +endif +ifndef CGEMM_BETA +CGEMM_BETA = zgemm_beta_rvv.c +endif +ifndef ZGEMM_BETA +ZGEMM_BETA = zgemm_beta_rvv.c +endif diff --git a/kernel/riscv64/amax_rvv.c b/kernel/riscv64/amax_rvv.c new file mode 100644 index 000000000..451fbc834 --- /dev/null +++ b/kernel/riscv64/amax_rvv.c @@ -0,0 +1,102 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m8_f32m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f32m8_tu +#define VFABSV_FLOAT __riscv_vfabs_v_f32m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m8_f64m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f64m8_tu +#define VFABSV_FLOAT __riscv_vfabs_v_f64m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT maxf = 0.0; + + if (n <= 0 || inc_x <= 0) return(maxf); + + FLOAT_V_T vx, vmax; + FLOAT_V_T_M1 v_res; + + v_res = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); + size_t vlmax = VSETVL_MAX; + vmax = VFMVVF_FLOAT(0.0, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vx = VFABSV_FLOAT(vx, vl); + vmax = VFMAXVV_FLOAT_TU(vmax, vmax, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vx = VFABSV_FLOAT(vx, vl); + vmax = VFMAXVV_FLOAT_TU(vmax, vmax, vx, vl); + } + + } + + v_res = VFREDMAXVS_FLOAT(vmax, v_res, vlmax); + maxf = VFMVFS_FLOAT_M1(v_res); + + return(maxf); +} diff --git a/kernel/riscv64/amax_vector.c b/kernel/riscv64/amax_vector.c index 1b7799340..b66d4871e 100644 --- a/kernel/riscv64/amax_vector.c +++ b/kernel/riscv64/amax_vector.c @@ -28,36 +28,41 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #include -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 -#define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m -#define VFMAXVV_FLOAT vfmax_vv_f32m8 +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# else +# define ELEN 32 +# endif #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 -#define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m -#define VFMAXVV_FLOAT vfmax_vv_f64m8 +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# else +# define ELEN 32 +# endif #endif +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMAXVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredmax_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))(v_res, va, vb, gvl) +#else +#define VFREDMAXVS_FLOAT JOIN(RISCV_RVV(vfredmax_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#endif +#define VFABS_FLOAT JOIN(RISCV_RVV(vfabs), _v_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) + FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0, j=0; @@ -65,103 +70,28 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT maxf=0.0; if (n <= 0 || inc_x <= 0) return(maxf); unsigned int gvl = 0; - FLOAT_V_T v0, v1, v_max; - FLOAT_V_T_M1 v_res, v_zero; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_zero = VFMVVF_FLOAT_M1(0, gvl); + FLOAT_V_T v0, v1; + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(0, 1); - MASK_T mask0, mask1; - FLOAT zero = 0.0; if(inc_x == 1){ gvl = VSETVL(n); if(gvl <= n/2){ - v_max = VFMVVF_FLOAT(0, gvl); for(i=0,j=0; i maxf) - maxf = *((FLOAT*)&v_res); + v0 = VFABS_FLOAT(v0, gvl); + v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl); j += gvl; } }else{ @@ -169,94 +99,27 @@ asm volatile( BLASLONG stride_x = inc_x * sizeof(FLOAT); if(gvl <= n/2){ BLASLONG inc_xv = inc_x * gvl; - v_max = VFMVVF_FLOAT(0, gvl); for(i=0,j=0; i maxf) - maxf = *((FLOAT*)&v_res); + v0 = VFABS_FLOAT(v0, gvl); + v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl); j += gvl; } } + + maxf = EXTRACT_FLOAT(v_res); return(maxf); } diff --git a/kernel/riscv64/amin_rvv.c b/kernel/riscv64/amin_rvv.c new file mode 100644 index 000000000..5186d7b12 --- /dev/null +++ b/kernel/riscv64/amin_rvv.c @@ -0,0 +1,102 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f32m8_tu +#define VFABSV_FLOAT __riscv_vfabs_v_f32m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f64m8_tu +#define VFABSV_FLOAT __riscv_vfabs_v_f64m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT minf = 0.0; + + if (n <= 0 || inc_x <= 0) return(minf); + + FLOAT_V_T vx, vmin; + FLOAT_V_T_M1 v_res; + + v_res = VFMVVF_FLOAT_M1(FLT_MAX, VSETVL_MAX_M1); + size_t vlmax = VSETVL_MAX; + vmin = VFMVVF_FLOAT(FLT_MAX, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vx = VFABSV_FLOAT(vx, vl); + vmin = VFMINVV_FLOAT_TU(vmin, vmin, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vx = VFABSV_FLOAT(vx, vl); + vmin = VFMINVV_FLOAT_TU(vmin, vmin, vx, vl); + } + + } + + v_res = VFREDMINVS_FLOAT(vmin, 
v_res, vlmax); + minf = VFMVFS_FLOAT_M1(v_res); + + return(minf); +} diff --git a/kernel/riscv64/amin_vector.c b/kernel/riscv64/amin_vector.c index f9b7defae..c4578eabf 100644 --- a/kernel/riscv64/amin_vector.c +++ b/kernel/riscv64/amin_vector.c @@ -26,232 +26,108 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" -#include -#include -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 -#define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m -#define VFMINVV_FLOAT vfmin_vv_f32m8 +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# define ABS fabs +# else +# define ELEN 32 +# define ABS fabsf +# endif #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 -#define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m -#define VFMINVV_FLOAT vfmin_vv_f64m8 +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# define ABS fabs +# else +# define ELEN 32 +# define ABS fabsf +# endif #endif +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMINVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredmin_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))(v_res, va, vb, gvl) +#else +#define VFREDMINVS_FLOAT JOIN(RISCV_RVV(vfredmin_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#endif +#define VFABS_FLOAT JOIN(RISCV_RVV(vfabs), _v_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) + FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i=0, j=0; - if (n <= 0 || inc_x <= 0) return(0.0); - FLOAT minf=FLT_MAX; + BLASLONG i=0, j=0; + BLASLONG ix=0; + FLOAT minf=0.0; + if (n <= 0 || inc_x <= 0) return(minf); + + minf = ABS(*x); + x += inc_x; + --n; + if (n == 0) return(minf); + unsigned int gvl = 0; - FLOAT_V_T v0, v1, v_min; - FLOAT_V_T_M1 v_res, v_max; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); + FLOAT_V_T v0, v1; + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(minf, 1); - MASK_T mask0, mask1; - FLOAT zero = 0.0; if(inc_x == 1){ gvl = VSETVL(n); if(gvl <= n/2){ - v_min = VFMVVF_FLOAT(FLT_MAX, gvl); for(i=0,j=0; i 0; n -= vl, x += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vx = VFABSV_FLOAT(vx, vl); + vsum = VFADDVV_FLOAT_TU(vsum, vsum, vx, vl); + } + + } else { + + BLASLONG stride_x 
= inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vx = VFABSV_FLOAT(vx, vl); + vsum = VFADDVV_FLOAT_TU(vsum, vsum, vx, vl); + } + + } + + v_res = VFREDSUMVS_FLOAT(vsum, v_res, vlmax); + asumf = VFMVFS_FLOAT_M1(v_res); + return(asumf); +} diff --git a/kernel/riscv64/asum_vector.c b/kernel/riscv64/asum_vector.c index fc73362bc..a652eafdd 100644 --- a/kernel/riscv64/asum_vector.c +++ b/kernel/riscv64/asum_vector.c @@ -28,111 +28,101 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDSUMVS_FLOAT vfredosum_vs_f32m8_f32m1 -#define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m -#define VFADDVV_FLOAT vfadd_vv_f32m8 +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# else +# define ELEN 32 +# endif #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1 -#define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m -#define VFADDVV_FLOAT vfadd_vv_f64m8 +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# else +# define ELEN 32 +# endif #endif + +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUMVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredusum_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))(v_res, va, vb, gvl) +#else +#define VFREDSUMVS_FLOAT JOIN(RISCV_RVV(vfredusum_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#endif +#define VFABS_FLOAT JOIN(RISCV_RVV(vfabs), _v_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) +#define VFADDVV_FLOAT JOIN(RISCV_RVV(vfadd), _vv_f, ELEN, LMUL, _) + FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0, j=0; - BLASLONG ix=0; FLOAT asumf=0.0; if (n <= 0 || inc_x <= 0) return(asumf); unsigned int gvl = 0; - FLOAT_V_T v0, v1, v_zero,v_sum; - FLOAT_V_T_M1 v_res, v_z0; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_z0 = VFMVVF_FLOAT_M1(0, gvl); + FLOAT_V_T v0, v1, v_sum; + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(0, 1); - MASK_T mask0, mask1; if(inc_x == 1){ gvl = VSETVL(n); - v_zero = VFMVVF_FLOAT(0, gvl); if(gvl <= n/2){ v_sum = VFMVVF_FLOAT(0, gvl); for(i=0,j=0; i 0; n -= vl, y += vl*inc_y) { + vl = VSETVL(n); + VSSEV_FLOAT(y, stride_y, vy, vl); + } + } + + } else { + if ((1 == inc_x) && (1 == inc_y)) { + for (size_t vl; n > 0; n -= 
vl, x += vl, y += vl) { + vl = VSETVL(n); + vx = VLEV_FLOAT(x, vl); + vy = VFMULVF_FLOAT(vx, alpha, vl); + VSEV_FLOAT (y, vy, vl); + } + } else if (1 == inc_x) { + BLASLONG stride_y = inc_y * sizeof(FLOAT); + for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { + vl = VSETVL(n); + vx = VLEV_FLOAT(x, vl); + vy = VFMULVF_FLOAT(vx, alpha, vl); + VSSEV_FLOAT (y, stride_y, vy, vl); + } + } else if (1 == inc_y) { + BLASLONG stride_x = inc_x * sizeof(FLOAT); + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { + vl = VSETVL(n); + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VFMULVF_FLOAT(vx, alpha, vl); + VSEV_FLOAT (y, vy, vl); + } + } else { + BLASLONG stride_x = inc_x * sizeof(FLOAT); + BLASLONG stride_y = inc_y * sizeof(FLOAT); + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { + vl = VSETVL(n); + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VFMULVF_FLOAT(vx, alpha, vl); + VSSEV_FLOAT (y, stride_y, vy, vl); + } + } + } + + } else { + if ( alpha == 0.0 ) { + if (1 == inc_y) { + for (size_t vl; n > 0; n -= vl, y += vl) { + vl = VSETVL(n); + vy = VLEV_FLOAT(y, vl); + vy = VFMULVF_FLOAT(vy, beta, vl); + VSEV_FLOAT (y, vy, vl); + } + } else { + BLASLONG stride_y = inc_y * sizeof(FLOAT); + for (size_t vl; n > 0; n -= vl, y += vl*inc_y) { + vl = VSETVL(n); + vy = VLSEV_FLOAT(y, stride_y, vl); + vy = VFMULVF_FLOAT(vy, beta, vl); + VSSEV_FLOAT (y, stride_y, vy, vl); + } + } + + } else { + if ((1 == inc_x) && (1 == inc_y)) { + for (size_t vl; n > 0; n -= vl, x += vl, y += vl) { + vl = VSETVL(n); + vx = VLEV_FLOAT(x, vl); + vy = VLEV_FLOAT(y, vl); + vy = VFMULVF_FLOAT(vy, beta, vl); + vy = VFMACCVF_FLOAT(vy, alpha, vx, vl); + VSEV_FLOAT (y, vy, vl); + } + } else if (1 == inc_x) { + BLASLONG stride_y = inc_y * sizeof(FLOAT); + for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { + vl = VSETVL(n); + vx = VLEV_FLOAT(x, vl); + vy = VLSEV_FLOAT(y, stride_y, vl); + vy = VFMULVF_FLOAT(vy, beta, vl); + vy = VFMACCVF_FLOAT(vy, alpha, vx, vl); + VSSEV_FLOAT (y, stride_y, vy, vl); + } + } else if (1 == inc_y) { + BLASLONG stride_x = inc_x * sizeof(FLOAT); + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { + vl = VSETVL(n); + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VLEV_FLOAT(y, vl); + vy = VFMULVF_FLOAT(vy, beta, vl); + vy = VFMACCVF_FLOAT(vy, alpha, vx, vl); + VSEV_FLOAT (y, vy, vl); + } + } else { + BLASLONG stride_x = inc_x * sizeof(FLOAT); + BLASLONG stride_y = inc_y * sizeof(FLOAT); + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { + vl = VSETVL(n); + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VLSEV_FLOAT(y, stride_y, vl); + vy = VFMULVF_FLOAT(vy, beta, vl); + vy = VFMACCVF_FLOAT(vy, alpha, vx, vl); + VSSEV_FLOAT (y, stride_y, vy, vl); + } + } + } + } + + return(0); +} diff --git a/kernel/riscv64/axpby_vector.c b/kernel/riscv64/axpby_vector.c index 676dfd474..721aad2b0 100644 --- a/kernel/riscv64/axpby_vector.c +++ b/kernel/riscv64/axpby_vector.c @@ -27,31 +27,40 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
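/* Editorial note (illustrative sketch, not upstream code): axpby_rvv.c above and
   the reworked axpby_vector.c below both implement the BLAS-extension update
   y := alpha*x + beta*y, with the alpha == 0 and beta == 0 cases split out so
   that x (or the old contents of y) is never read when its coefficient is zero.
   A scalar reference for the general case, for comparison only:

       for (BLASLONG i = 0, ix = 0, iy = 0; i < n;
            i++, ix += inc_x, iy += inc_y)
           y[iy] = alpha * x[ix] + beta * y[iy];

   The unit-stride paths use vle/vse loads and stores; the strided paths pass
   inc_x * sizeof(FLOAT) (resp. inc_y) as the byte stride to vlse/vsse. */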
#include "common.h" -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT vle32_v_f32m4 -#define VLSEV_FLOAT vlse32_v_f32m4 -#define VSEV_FLOAT vse32_v_f32m4 -#define VSSEV_FLOAT vsse32_v_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMULVF_FLOAT vfmul_vf_f32m4 +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# else +# define ELEN 32 +# endif #else -#define VSETVL(n) vsetvl_e64m4(n) -#define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT vle64_v_f64m4 -#define VLSEV_FLOAT vlse64_v_f64m4 -#define VSEV_FLOAT vse64_v_f64m4 -#define VSSEV_FLOAT vsse64_v_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMULVF_FLOAT vfmul_vf_f64m4 +# define LMUL m4 +# if defined(DOUBLE) +# define ELEN 64 +# else +# define ELEN 32 +# endif #endif +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#define VSEV_FLOAT JOIN(RISCV_RVV(vse), ELEN, _v_f, ELEN, LMUL) +#define VSSEV_FLOAT JOIN(RISCV_RVV(vsse), ELEN, _v_f, ELEN, LMUL) +#define VFMACCVF_FLOAT JOIN(RISCV_RVV(vfmacc), _vf_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) +#define VFMULVF_FLOAT JOIN(RISCV_RVV(vfmul), _vf_f, ELEN, LMUL, _) + int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y) { - if (n < 0) return(0); + if (n <= 0) return(0); BLASLONG i=0, j=0; unsigned int gvl = 0; @@ -60,6 +69,63 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * BLASLONG stride_x, stride_y, ix = 0, iy = 0; + if (inc_x == 0 || inc_y == 0) { /* use trivial non-vectorized loop if either increment is zero */ + + if ( beta == 0.0 ) + { + + if ( alpha == 0.0 ) + { + while(i < n) + { + y[iy] = 0.0 ; + iy += inc_y ; + i++ ; + } + } + else + { + while(i < n) + { + y[iy] = alpha * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + } + + + } + + } + else + { + + if ( alpha == 0.0 ) + { + while(i < n) + { + y[iy] = beta * y[iy] ; + iy += inc_y ; + i++ ; + } + } + else + { + while(i < n) + { + y[iy] = alpha * x[ix] + beta * y[iy] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + } + } + + } + + return(0); + + } else { /* vectorized approach for non-zero increments */ + if(beta == 0.0){ if(alpha == 0.0){//alpha == 0 && beta == 0 if(inc_y == 1){ @@ -372,5 +438,6 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * } } return(0); + } } diff --git a/kernel/riscv64/axpy.c b/kernel/riscv64/axpy.c index fb1094dd9..19d12ad3f 100644 --- a/kernel/riscv64/axpy.c +++ b/kernel/riscv64/axpy.c @@ -42,7 +42,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS BLASLONG i=0; BLASLONG ix,iy; - if ( n < 0 ) return(0); + if ( n <= 0 ) return(0); if ( da == 0.0 ) return(0); ix = 0; diff --git a/kernel/riscv64/axpy_rvv.c b/kernel/riscv64/axpy_rvv.c new file mode 100644 index 000000000..8bc2f30de --- /dev/null +++ b/kernel/riscv64/axpy_rvv.c @@ -0,0 +1,109 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VSEV_FLOAT __riscv_vse32_v_f32m8 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m8 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8 +#else +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VSEV_FLOAT __riscv_vse64_v_f64m8 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m8 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8 +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + if ( n <= 0 ) return(0); + if ( da == 0.0 ) return(0); + + FLOAT_V_T vx, vy; + + if(inc_x == 1 && inc_y == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl, y += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vy = VLEV_FLOAT(y, vl); + vy = VFMACCVF_FLOAT(vy, da, vx, vl); + VSEV_FLOAT (y, vy, vl); + } + + } else if (1 == inc_y) { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VLEV_FLOAT(y, vl); + vy = VFMACCVF_FLOAT(vy, da, vx, vl); + VSEV_FLOAT(y, vy, vl); + } + + } else if (1 == inc_x) { + + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vy = VLSEV_FLOAT(y, stride_y, vl); + vy = VFMACCVF_FLOAT(vy, da, vx, vl); + VSSEV_FLOAT(y, stride_y, vy, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VLSEV_FLOAT(y, stride_y, vl); + vy = VFMACCVF_FLOAT(vy, da, vx, vl); + 
VSSEV_FLOAT(y, stride_y, vy, vl); + } + + } + + return(0); +} diff --git a/kernel/riscv64/axpy_vector.c b/kernel/riscv64/axpy_vector.c index 6f921f2d6..6dffe5f09 100644 --- a/kernel/riscv64/axpy_vector.c +++ b/kernel/riscv64/axpy_vector.c @@ -25,26 +25,38 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ + #include "common.h" -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT vle32_v_f32m4 -#define VLSEV_FLOAT vlse32_v_f32m4 -#define VSEV_FLOAT vse32_v_f32m4 -#define VSSEV_FLOAT vsse32_v_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# else +# define ELEN 32 +# endif #else -#define VSETVL(n) vsetvl_e64m4(n) -#define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT vle64_v_f64m4 -#define VLSEV_FLOAT vlse64_v_f64m4 -#define VSEV_FLOAT vse64_v_f64m4 -#define VSSEV_FLOAT vsse64_v_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 +# define LMUL m4 +# if defined(DOUBLE) +# define ELEN 64 +# else +# define ELEN 32 +# endif #endif +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#define VSEV_FLOAT JOIN(RISCV_RVV(vse), ELEN, _v_f, ELEN, LMUL) +#define VSSEV_FLOAT JOIN(RISCV_RVV(vsse), ELEN, _v_f, ELEN, LMUL) +#define VFMACCVF_FLOAT JOIN(RISCV_RVV(vfmacc), _vf_f, ELEN, LMUL, _) + int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i=0, j=0, jx=0, jy=0; @@ -53,7 +65,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS FLOAT_V_T vy0, vy1; BLASLONG stride_x, stride_y; - if (n < 0) return(0); + if (n <= 0) return(0); if (da == 0.0) return(0); if (inc_x == 1 && inc_y == 1) { diff --git a/kernel/riscv64/cgemm_kernel_8x4_zvl128b.c b/kernel/riscv64/cgemm_kernel_8x4_zvl128b.c new file mode 100644 index 000000000..bd615389c --- /dev/null +++ b/kernel/riscv64/cgemm_kernel_8x4_zvl128b.c @@ -0,0 +1,996 @@ +/* + +AUTOGENERATED KERNEL +Script: ./kernel/riscv64/generate_kernel.py +Settings: + LMUL=2 + M=8 + M_tail_scalar_from=2 + N=4 + __riscv_='__riscv_' + complex=True + conjugate=False + cpu='zvl128b' + force_acc_double=False + index_type='BLASLONG' + op='gemm' + param_precision='float' + reg_width_bits=128 + tail_policy='' + trace=False + +Derived: + ELEN_ACC=32 + ELEN_PARAM=32 + LMUL_ACC=2 + VFMACC='__riscv_vfmacc_vf_f32m2' + VFMUL='__riscv_vfmul_vf_f32m2' + VLEV='__riscv_vle32_v_f32m2' + VLSEV='__riscv_vlse32_v_f32m2' + VMACC_TO_ACC='__riscv_vfmacc_vf_f32m2' + VMUL_TO_ACC='__riscv_vfmul_vf_f32m2' + VSETVL='__riscv_vsetvl_e32m2' + VSEV='__riscv_vse32_v_f32m2' + VSSEV='__riscv_vsse32_v_f32m2' + acc_vector_t='vfloat32m2_t' + output='cgemm_kernel_8x4_zvl128b.c' + param_scalar_t='float' + param_vector_t='vfloat32m2_t' + +*/ + +#include "common.h" + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define S0 1 +#define S1 -1 +#define S2 1 +#define S3 1 +#define VFMACC_RR __riscv_vfmsac +#define VFMACC_RI __riscv_vfmacc 
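/* Editorial note (explanatory, not part of the generated kernel): S0..S3 and the
   VFMACC_RR / VFMACC_RI choices encode the four conjugation variants of the
   complex product (a_r + i*a_i) * (b_r + i*b_i).  The scalar tail loops below
   accumulate

       real += S0*a_r*b_r + S1*a_i*b_i
       imag += S2*a_i*b_r + S3*a_r*b_i

   and the vectorised path reproduces the same signs by picking among
   vfmacc / vfmsac / vfnmsac / vfnmacc: VFMACC_RR sets the sign of the a_i*b_i
   term in the real part, VFMACC_RI the sign pattern of the imaginary part.
   For the unconjugated case in this branch, real = a_r*b_r - a_i*b_i and
   imag = a_i*b_r + a_r*b_i, hence S1 = -1 and VFMACC_RR = vfmsac
   (vd = f*vs - vd). */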
+#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define S0 1 +#define S1 1 +#define S2 1 +#define S3 -1 +#define VFMACC_RR __riscv_vfmacc +#define VFMACC_RI __riscv_vfmsac +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define S0 1 +#define S1 1 +#define S2 -1 +#define S3 1 +#define VFMACC_RR __riscv_vfmacc +#define VFMACC_RI __riscv_vfnmsac +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define S0 1 +#define S1 -1 +#define S2 -1 +#define S3 -1 +#define VFMACC_RR __riscv_vfmsac +#define VFMACC_RI __riscv_vfnmacc +#endif + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc) + +{ + BLASLONG gvl = 0; + BLASLONG m_top = 0; + BLASLONG n_top = 0; + + // -- MAIN PASS + + for (BLASLONG j = 0; j < N / 4; j += 1) { + m_top = 0; + BLASLONG gvl = __riscv_vsetvl_e32m2(8); + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + float B1r = B[bi + 1 * 2 + 0]; + float B1i = B[bi + 1 * 2 + 1]; + float B2r = B[bi + 2 * 2 + 0]; + float B2i = B[bi + 2 * 2 + 1]; + float B3r = B[bi + 3 * 2 + 0]; + float B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k + // leaving 6 vector registers for temporaries + // performing 2 operations between reuses of temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + vfloat32m2_t ACC1r = tmp1r; + vfloat32m2_t ACC1i = tmp1i; + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + vfloat32m2_t ACC2r = tmp0r; + vfloat32m2_t ACC2i = tmp0i; + vfloat32m2_t ACC3r = tmp1r; + vfloat32m2_t ACC3i = tmp1i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + B2r = B[bi + 2 * 2 + 0]; + B2i = B[bi + 2 * 2 + 1]; + B3r = B[bi + 3 * 2 + 0]; + B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = 
__riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl); + ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl); + ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl); + ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat32m2_t C2r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C2i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat32m2_t C3r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C3i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); + C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); + C2r = __riscv_vfmacc(C2r, alphar, ACC2r, gvl); + C2i = __riscv_vfmacc(C2i, alphar, ACC2i, gvl); + C3r = __riscv_vfmacc(C3r, alphar, ACC3r, gvl); + C3i = __riscv_vfmacc(C3i, alphar, ACC3i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl); + C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl); + C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl); + C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl); + + m_top += 8; + } + + // -- tails for main pass + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + float B1r = B[bi + 1 * 2 + 0]; + float B1i = B[bi + 1 * 2 + 1]; + float B2r = B[bi + 2 * 2 + 0]; + float B2i = B[bi + 2 * 2 + 1]; + float B3r = B[bi + 3 * 2 + 0]; + float B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * 
gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k + // leaving 6 vector registers for temporaries + // performing 2 operations between reuses of temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + vfloat32m2_t ACC1r = tmp1r; + vfloat32m2_t ACC1i = tmp1i; + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + vfloat32m2_t ACC2r = tmp0r; + vfloat32m2_t ACC2i = tmp0i; + vfloat32m2_t ACC3r = tmp1r; + vfloat32m2_t ACC3i = tmp1i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + B2r = B[bi + 2 * 2 + 0]; + B2i = B[bi + 2 * 2 + 1]; + B3r = B[bi + 3 * 2 + 0]; + B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl); + ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl); + ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl); + ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat32m2_t C2r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C2i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat32m2_t C3r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C3i = 
__riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); + C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); + C2r = __riscv_vfmacc(C2r, alphar, ACC2r, gvl); + C2i = __riscv_vfmacc(C2i, alphar, ACC2i, gvl); + C3r = __riscv_vfmacc(C3r, alphar, ACC3r, gvl); + C3i = __riscv_vfmacc(C3i, alphar, ACC3i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl); + C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl); + C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl); + C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl); + + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + float result4 = 0; + float result5 = 0; + float result6 = 0; + float result7 = 0; + float result8 = 0; + float result9 = 0; + float result10 = 0; + float result11 = 0; + float result12 = 0; + float result13 = 0; + float result14 = 0; + float result15 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; + result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; + result8 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; + result9 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; + result10 += S0 * A[ai + 2 + 0] * B[bi + 4 + 0] + S1 * A[ai + 2 + 1] * B[bi + 4 + 1]; + result11 += S2 * A[ai + 2 + 1] * B[bi + 4 + 0] + S3 * A[ai + 2 + 0] * B[bi + 4 + 1]; + result12 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; + result13 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; + result14 += S0 * A[ai + 2 + 0] * B[bi + 6 + 0] + S1 * A[ai + 2 + 1] * B[bi + 6 + 1]; + result15 += S2 * A[ai + 2 + 1] * B[bi + 6 + 0] + S3 * A[ai + 2 + 0] * B[bi + 6 + 1]; + ai += 2 * 2; + bi += 4 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr 
+= result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; + Cr += result4 * alphar; + Ci += result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 1) * 2 + 1]; + Cr += result6 * alphar; + Ci += result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; + Cr = C[(ci + 2 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 2 * ldc + 0) * 2 + 1]; + Cr += result8 * alphar; + Ci += result9 * alphar; + Cr -= result9 * alphai; + Ci += result8 * alphai; + C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 2 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 2 * ldc + 1) * 2 + 1]; + Cr += result10 * alphar; + Ci += result11 * alphar; + Cr -= result11 * alphai; + Ci += result10 * alphai; + C[(ci + 2 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 1) * 2 + 1] = Ci; + Cr = C[(ci + 3 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 3 * ldc + 0) * 2 + 1]; + Cr += result12 * alphar; + Ci += result13 * alphar; + Cr -= result13 * alphai; + Ci += result12 * alphai; + C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 3 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 3 * ldc + 1) * 2 + 1]; + Cr += result14 * alphar; + Ci += result15 * alphar; + Cr -= result15 * alphai; + Ci += result14 * alphai; + C[(ci + 3 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + float result4 = 0; + float result5 = 0; + float result6 = 0; + float result7 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; + result6 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; + result7 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; + ai += 1 * 2; + bi += 4 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 
0) * 2 + 1] = Ci; + Cr = C[(ci + 2 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 2 * ldc + 0) * 2 + 1]; + Cr += result4 * alphar; + Ci += result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 3 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 3 * ldc + 0) * 2 + 1]; + Cr += result6 * alphar; + Ci += result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 4; + } + + // -- tails for N=2 + + if (N & 2) { + gvl = __riscv_vsetvl_e32m2(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + float B1r = B[bi + 1 * 2 + 0]; + float B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k + // leaving 10 vector registers for temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + vfloat32m2_t ACC1r = tmp1r; + vfloat32m2_t ACC1i = tmp1i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); + C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], 
sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + float B1r = B[bi + 1 * 2 + 0]; + float B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k + // leaving 10 vector registers for temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + vfloat32m2_t ACC1r = tmp1r; + vfloat32m2_t ACC1i = tmp1i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); + C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + float result2 = 0; 
+ float result3 = 0; + float result4 = 0; + float result5 = 0; + float result6 = 0; + float result7 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; + result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; + ai += 2 * 2; + bi += 2 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; + Cr += result4 * alphar; + Ci += result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 1) * 2 + 1]; + Cr += result6 * alphar; + Ci += result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + ai += 1 * 2; + bi += 2 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 2; + } + + // -- tails for N=1 + + if (N & 1) { + gvl = __riscv_vsetvl_e32m2(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], 
sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k + // leaving 12 vector registers for temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k + // leaving 12 vector registers for temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vsse32_v_f32m2(&C[ci * 2 
+ 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + ai += 2 * 2; + bi += 1 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + float result1 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + ai += 1 * 2; + bi += 1 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 1; + } + + return 0; +} diff --git a/kernel/riscv64/cgemm_kernel_8x8_zvl256b.c b/kernel/riscv64/cgemm_kernel_8x8_zvl256b.c new file mode 100644 index 000000000..7980c029a --- /dev/null +++ b/kernel/riscv64/cgemm_kernel_8x8_zvl256b.c @@ -0,0 +1,1931 @@ +/* + +AUTOGENERATED KERNEL +Settings: + LMUL=1 + M=8 + M_tail_scalar_from=1 + N=8 + __riscv_='__riscv_' + complex=True + conjugate=False + cpu='zvl256b' + force_acc_double=False + index_type='BLASLONG' + op='gemm' + param_precision='float' + reg_width_bits=256 + tail_policy='' + trace=False + +Derived: + ELEN_ACC=32 + ELEN_PARAM=32 + LMUL_ACC=1 + VFMACC='__riscv_vfmacc_vf_f32m1' + VFMUL='__riscv_vfmul_vf_f32m1' + VLEV='__riscv_vle32_v_f32m1' + VLSEV='__riscv_vlse32_v_f32m1' + VMACC_TO_ACC='__riscv_vfmacc_vf_f32m1' + VMUL_TO_ACC='__riscv_vfmul_vf_f32m1' + VSETVL='__riscv_vsetvl_e32m1' + VSEV='__riscv_vse32_v_f32m1' + VSSEV='__riscv_vsse32_v_f32m1' + acc_vector_t='vfloat32m1_t' + output='cgemm_kernel_8x8_zvl256b.c' + param_scalar_t='float' + param_vector_t='vfloat32m1_t' + +*/ + +#include "common.h" + + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + #define S0 1 + #define S1 -1 + #define S2 1 + #define S3 1 + #define VFMACC_RR __riscv_vfmsac + #define VFMACC_RI __riscv_vfmacc +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + #define S0 1 + #define S1 1 + #define S2 1 + #define S3 -1 + #define VFMACC_RR __riscv_vfmacc + #define VFMACC_RI __riscv_vfmsac +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + #define S0 1 + 
#define S1 1 + #define S2 -1 + #define S3 1 + #define VFMACC_RR __riscv_vfmacc + #define VFMACC_RI __riscv_vfnmsac +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + #define S0 1 + #define S1 -1 + #define S2 -1 + #define S3 -1 + #define VFMACC_RR __riscv_vfmsac + #define VFMACC_RI __riscv_vfnmacc +#endif + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc) + +{ + BLASLONG gvl = 0; + BLASLONG m_top = 0; + BLASLONG n_top = 0; + + + // -- MAIN PASS + + for (BLASLONG j=0; j 0; n -= vl, x += vl, y += vl) { + vl = VSETVL(n); + v0 = VLEV_FLOAT(x, vl); + VSEV_FLOAT(y, v0, vl); + } + + } else if (inc_y == 1) { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for(size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { + vl = VSETVL(n); + v0 = VLSEV_FLOAT(x, stride_x, vl); + VSEV_FLOAT(y, v0, vl); + } + + } else if(inc_x == 1) { + + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for(size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { + vl = VSETVL(n); + v0 = VLEV_FLOAT(x, vl); + VSSEV_FLOAT(y, stride_y, v0, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for(size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { + vl = VSETVL(n); + v0 = VLSEV_FLOAT(x, stride_x, vl); + VSSEV_FLOAT(y, stride_y, v0, vl); + } + + } + + return(0); +} diff --git a/kernel/riscv64/copy_vector.c b/kernel/riscv64/copy_vector.c index fee5e195d..ccbd6e482 100644 --- a/kernel/riscv64/copy_vector.c +++ b/kernel/riscv64/copy_vector.c @@ -25,22 +25,35 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VSEV_FLOAT vse32_v_f32m8 -#define VSSEV_FLOAT vsse32_v_f32m8 + +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# else +# define ELEN 32 +# endif #else -#define VSETVL(n) vsetvl_e64m8(n) -#define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VSEV_FLOAT vse64_v_f64m8 -#define VSSEV_FLOAT vsse64_v_f64m8 +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# else +# define ELEN 32 +# endif #endif +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#define VSEV_FLOAT JOIN(RISCV_RVV(vse), ELEN, _v_f, ELEN, LMUL) +#define VSSEV_FLOAT JOIN(RISCV_RVV(vsse), ELEN, _v_f, ELEN, LMUL) + int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { BLASLONG i=0, j=0; @@ -58,7 +71,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) stride_x = inc_x * sizeof(FLOAT); if(gvl <= n/4){ BLASLONG inc_xv = inc_x * gvl; - BLASLONG gvl3 = gvl * 3; + unsigned int gvl3 = gvl * 3; BLASLONG inc_xv3 = inc_xv * 3; for(i=0,j=0; i 0; n -= vl, x += vl, y += vl) { + vl = __riscv_vsetvl_e64m8(n); + +#if !defined(DOUBLE) + vfloat32m4_t vx = __riscv_vle32_v_f32m4(x, vl); + 
vfloat32m4_t vy = __riscv_vle32_v_f32m4(y, vl); + + vr = __riscv_vfwmacc_vv_f64m8_tu(vr, vx, vy, vl); +#else + vfloat64m8_t vx = __riscv_vle64_v_f64m8(x, vl); + vfloat64m8_t vy = __riscv_vle64_v_f64m8(y, vl); + + vr = __riscv_vfmacc_vv_f64m8_tu(vr, vx, vy, vl); +#endif + } + + } else if (1 == inc_x) { + + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { + vl = __riscv_vsetvl_e64m8(n); + +#if !defined(DOUBLE) + vfloat32m4_t vx = __riscv_vle32_v_f32m4(x, vl); + vfloat32m4_t vy = __riscv_vlse32_v_f32m4(y, stride_y, vl); + + vr = __riscv_vfwmacc_vv_f64m8_tu(vr, vx, vy, vl); +#else + vfloat64m8_t vx = __riscv_vle64_v_f64m8(x, vl); + vfloat64m8_t vy = __riscv_vlse64_v_f64m8(y, stride_y, vl); + + vr = __riscv_vfmacc_vv_f64m8_tu(vr, vx, vy, vl); +#endif + } + } else if (1 == inc_y) { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { + vl = __riscv_vsetvl_e64m8(n); + +#if !defined(DOUBLE) + vfloat32m4_t vx = __riscv_vlse32_v_f32m4(x, stride_x, vl); + vfloat32m4_t vy = __riscv_vle32_v_f32m4(y, vl); + + vr = __riscv_vfwmacc_vv_f64m8_tu(vr, vx, vy, vl); +#else + vfloat64m8_t vx = __riscv_vlse64_v_f64m8(x, stride_x, vl); + vfloat64m8_t vy = __riscv_vle64_v_f64m8(y, vl); + + vr = __riscv_vfmacc_vv_f64m8_tu(vr, vx, vy, vl); +#endif + } + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { + vl = __riscv_vsetvl_e64m8(n); + +#if !defined(DOUBLE) + vfloat32m4_t vx = __riscv_vlse32_v_f32m4(x, stride_x, vl); + vfloat32m4_t vy = __riscv_vlse32_v_f32m4(y, stride_y, vl); + + vr = __riscv_vfwmacc_vv_f64m8_tu(vr, vx, vy, vl); +#else + vfloat64m8_t vx = __riscv_vlse64_v_f64m8(x, stride_x, vl); + vfloat64m8_t vy = __riscv_vlse64_v_f64m8(y, stride_y, vl); + + vr = __riscv_vfmacc_vv_f64m8_tu(vr, vx, vy, vl); +#endif + } + } + + vfloat64m1_t vec_zero = __riscv_vfmv_v_f_f64m1(0, vlmax); + vfloat64m1_t vec_sum = __riscv_vfredusum_vs_f64m8_f64m1(vr, vec_zero, vlmax); + dot = __riscv_vfmv_f_s_f64m1_f64(vec_sum); + + return(dot); +} diff --git a/kernel/riscv64/dot_vector.c b/kernel/riscv64/dot_vector.c index cc27d68ed..38ccc4778 100644 --- a/kernel/riscv64/dot_vector.c +++ b/kernel/riscv64/dot_vector.c @@ -27,31 +27,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
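The dot kernel above accumulates single-precision products into a double-precision vector accumulator through the widening FMA, using the tail-undisturbed (_tu) form so lanes beyond a short final vl keep their accumulated values, and reduces to a scalar only once at the end. As a point of reference, a minimal scalar sketch of the same computation under the assumption of positive element strides (the function and parameter names are illustrative, not OpenBLAS API):

#include <stddef.h>

/* Minimal scalar sketch: float inputs, double accumulation, general
 * (positive) strides. ref_dsdot and its parameters are illustrative
 * names, not part of the OpenBLAS interface. */
static double ref_dsdot(size_t n, const float *x, size_t incx,
                        const float *y, size_t incy)
{
    double acc = 0.0;                               /* wide accumulator */
    for (size_t i = 0; i < n; i++)
        acc += (double)x[i * incx] * (double)y[i * incy];
    return acc;
}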
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLEV_FLOAT vle32_v_f32m4 -#define VLSEV_FLOAT vlse32_v_f32m4 -#define VFREDSUM_FLOAT vfredosum_vs_f32m4_f32m1 -#define VFMACCVV_FLOAT vfmacc_vv_f32m4 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFDOTVV_FLOAT vfdot_vv_f32m4 +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f32m4_f32m1(v_res, va, vb, gvl) #else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m1() +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m4_f32m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFDOTVV_FLOAT RISCV_RVV(vfdot_vv_f32m4) +#else +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLEV_FLOAT vle64_v_f64m4 -#define VLSEV_FLOAT vlse64_v_f64m4 -#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 -#define VFMACCVV_FLOAT vfmacc_vv_f64m4 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFDOTVV_FLOAT vfdot_vv_f64m4 +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f64m4_f64m1(v_res, va, vb, gvl) +#else +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m4_f64m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFDOTVV_FLOAT RISCV_RVV(vfdot_vv_f64m4) #endif #if defined(DSDOT) @@ -82,8 +88,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) j += gvl; } if(j > 0){ - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - dot += (double)VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + dot += (double)EXTRACT_FLOAT(v_res); } //tail if(j < n){ @@ -93,13 +99,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); //vr = VFDOTVV_FLOAT(vx, vy, gvl); vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - dot += (double)VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + dot += (double)EXTRACT_FLOAT(v_res); } }else if(inc_y == 1){ gvl = VSETVL(n); vr = VFMVVF_FLOAT(0, gvl); - int stride_x = inc_x * sizeof(FLOAT); + BLASLONG stride_x = inc_x * sizeof(FLOAT); for(i=0,j=0; i 0){ - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - dot += (double)VFMVFS_FLOAT(v_res); - + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + dot += (double)EXTRACT_FLOAT(v_res); } //tail if(j < n){ @@ -119,14 +124,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); //vr = VFDOTVV_FLOAT(vx, vy, gvl); vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - dot += (double)VFMVFS_FLOAT(v_res); - + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + dot += (double)EXTRACT_FLOAT(v_res); } }else if(inc_x 
== 1){ gvl = VSETVL(n); vr = VFMVVF_FLOAT(0, gvl); - int stride_y = inc_y * sizeof(FLOAT); + BLASLONG stride_y = inc_y * sizeof(FLOAT); for(i=0,j=0; i 0){ - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - dot += (double)VFMVFS_FLOAT(v_res); - + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + dot += (double)EXTRACT_FLOAT(v_res); } //tail if(j < n){ @@ -146,15 +149,14 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); //vr = VFDOTVV_FLOAT(vx, vy, gvl); vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - dot += (double)VFMVFS_FLOAT(v_res); - + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + dot += (double)EXTRACT_FLOAT(v_res); } }else{ gvl = VSETVL(n); vr = VFMVVF_FLOAT(0, gvl); - int stride_x = inc_x * sizeof(FLOAT); - int stride_y = inc_y * sizeof(FLOAT); + BLASLONG stride_x = inc_x * sizeof(FLOAT); + BLASLONG stride_y = inc_y * sizeof(FLOAT); for(i=0,j=0; i 0){ - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - dot += (double)VFMVFS_FLOAT(v_res); - + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + dot += (double)EXTRACT_FLOAT(v_res); } //tail if(j < n){ @@ -174,9 +175,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); //vr = VFDOTVV_FLOAT(vx, vy, gvl); vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - dot += (double)VFMVFS_FLOAT(v_res); - + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + dot += (double)EXTRACT_FLOAT(v_res); } } return(dot); diff --git a/kernel/riscv64/dsdot_vector.c b/kernel/riscv64/dsdot_vector.c new file mode 100644 index 000000000..e972828b5 --- /dev/null +++ b/kernel/riscv64/dsdot_vector.c @@ -0,0 +1,152 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
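The dot_vector.c changes above keep per-lane partial sums in a vector register across the whole stripmined loop and apply the vfredusum reduction only after the loop (and once more for the scalar-visible tail), instead of reducing on every iteration. A rough scalar model of that accumulation pattern, with a fixed lane count standing in for the runtime vector length (all names illustrative):

#include <stddef.h>

#define LANES 8   /* stand-in for the hardware vector length */

static float model_dot(size_t n, const float *x, const float *y)
{
    float partial[LANES] = {0};                 /* per-lane accumulators */
    size_t i = 0;
    for (; i + LANES <= n; i += LANES)          /* main stripmined loop  */
        for (int l = 0; l < LANES; l++)
            partial[l] += x[i + l] * y[i + l];
    float dot = 0.0f;
    for (int l = 0; l < LANES; l++)             /* single reduction      */
        dot += partial[l];
    for (; i < n; i++)                          /* tail elements         */
        dot += x[i] * y[i];
    return dot;
}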
+*****************************************************************************/ + +#include "common.h" + +double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0, j=0; + double dot = 0.0 ; + + if ( n < 1 ) return(dot); + vfloat64m4_t vr; + vfloat32m2_t vx, vy; + unsigned int gvl = 0; + vfloat64m1_t v_res, v_z0; + gvl = vsetvlmax_e64m1(); + v_res = vfmv_v_f_f64m1(0, gvl); + v_z0 = vfmv_v_f_f64m1(0, gvl); + + if(inc_x == 1 && inc_y == 1){ + gvl = vsetvl_e64m4(n); + vr = vfmv_v_f_f64m4(0, gvl); + for(i=0,j=0; i 0){ + v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); + dot += (double)vfmv_f_s_f64m1_f64(v_res); + } + //tail + if(j < n){ + gvl = vsetvl_e64m4(n-j); + vx = vle32_v_f32m2(&x[j], gvl); + vy = vle32_v_f32m2(&y[j], gvl); + vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl); + //vr = vfdot_vv_f32m2(vx, vy, gvl); + vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl); + v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); + dot += (double)vfmv_f_s_f64m1_f64(v_res); + } + }else if(inc_y == 1){ + gvl = vsetvl_e64m4(n); + vr = vfmv_v_f_f64m4(0, gvl); + int stride_x = inc_x * sizeof(FLOAT); + for(i=0,j=0; i 0){ + v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); + dot += (double)vfmv_f_s_f64m1_f64(v_res); + + } + //tail + if(j < n){ + gvl = vsetvl_e64m4(n-j); + vx = vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl); + vy = vle32_v_f32m2(&y[j], gvl); + vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl); + //vr = vfdot_vv_f32m2(vx, vy, gvl); + vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl); + v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); + dot += (double)vfmv_f_s_f64m1_f64(v_res); + + } + }else if(inc_x == 1){ + gvl = vsetvl_e64m4(n); + vr = vfmv_v_f_f64m4(0, gvl); + int stride_y = inc_y * sizeof(FLOAT); + for(i=0,j=0; i 0){ + v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); + dot += (double)vfmv_f_s_f64m1_f64(v_res); + + } + //tail + if(j < n){ + gvl = vsetvl_e64m4(n-j); + vx = vle32_v_f32m2(&x[j], gvl); + vy = vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl); + vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl); + //vr = vfdot_vv_f32m2(vx, vy, gvl); + vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl); + v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); + dot += (double)vfmv_f_s_f64m1_f64(v_res); + + } + }else{ + gvl = vsetvl_e64m4(n); + vr = vfmv_v_f_f64m4(0, gvl); + int stride_x = inc_x * sizeof(FLOAT); + int stride_y = inc_y * sizeof(FLOAT); + for(i=0,j=0; i 0){ + v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); + dot += (double)vfmv_f_s_f64m1_f64(v_res); + + } + //tail + if(j < n){ + gvl = vsetvl_e64m4(n-j); + vx = vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl); + vy = vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl); + vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl); + //vr = vfdot_vv_f32m2(vx, vy, gvl); + vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl); + v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); + dot += (double)vfmv_f_s_f64m1_f64(v_res); + + } + } + return(dot); +} diff --git a/kernel/riscv64/dtrmm_kernel_8x4_zvl128b.c b/kernel/riscv64/dtrmm_kernel_8x4_zvl128b.c new file mode 100644 index 000000000..c1e0da86e --- /dev/null +++ b/kernel/riscv64/dtrmm_kernel_8x4_zvl128b.c @@ -0,0 +1,660 @@ +/* + +AUTOGENERATED KERNEL +Script: ./kernel/riscv64/generate_kernel.py +Settings: + LMUL=4 + M=8 + M_tail_scalar_from=2 + N=4 + __riscv_='__riscv_' + complex=False + conjugate=False + cpu='zvl128b' + force_acc_double=False + index_type='BLASLONG' + op='trmm' + param_precision='double' + reg_width_bits=128 + tail_policy='' + trace=False + +Derived: + ELEN_ACC=64 + ELEN_PARAM=64 + LMUL_ACC=4 + 
VFMACC='__riscv_vfmacc_vf_f64m4' + VFMUL='__riscv_vfmul_vf_f64m4' + VLEV='__riscv_vle64_v_f64m4' + VLSEV='__riscv_vlse64_v_f64m4' + VMACC_TO_ACC='__riscv_vfmacc_vf_f64m4' + VMUL_TO_ACC='__riscv_vfmul_vf_f64m4' + VSETVL='__riscv_vsetvl_e64m4' + VSEV='__riscv_vse64_v_f64m4' + VSSEV='__riscv_vsse64_v_f64m4' + acc_vector_t='vfloat64m4_t' + output='dtrmm_kernel_8x4_zvl128b.c' + param_scalar_t='double' + param_vector_t='vfloat64m4_t' + +*/ + +#include "common.h" + +#if defined(LEFT) != defined(TRANSA) +#define BACKWARDS +#endif + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc, BLASLONG offset) + +{ + BLASLONG gvl = 0; + BLASLONG m_top = 0; + BLASLONG n_top = 0; + + // -- MAIN PASS + + for (BLASLONG j = 0; j < N / 4; j += 1) { + m_top = 0; + BLASLONG gvl = __riscv_vsetvl_e64m4(8); + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 8; + bi += off * 4; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 8; +#else + pass_K = off + 4; +#endif +#endif + double B0 = B[bi + 0]; + double B1 = B[bi + 1]; + double B2 = B[bi + 2]; + double B3 = B[bi + 3]; + bi += 4; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); + vfloat64m4_t result2 = __riscv_vfmul_vf_f64m4(A0, B2, gvl); + vfloat64m4_t result3 = __riscv_vfmul_vf_f64m4(A0, B3, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + B2 = B[bi + 2]; + B3 = B[bi + 3]; + bi += 4; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); + result2 = __riscv_vfmacc_vf_f64m4(result2, B2, A0, gvl); + result3 = __riscv_vfmacc_vf_f64m4(result3, B3, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); + vfloat64m4_t c1 = __riscv_vfmul_vf_f64m4(result1, alpha, gvl); + vfloat64m4_t c2 = __riscv_vfmul_vf_f64m4(result2, alpha, gvl); + vfloat64m4_t c3 = __riscv_vfmul_vf_f64m4(result3, alpha, gvl); + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c1, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c2, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c3, gvl); + m_top += 8; + } + + // -- tails for main pass + + if (M & 4) { + gvl = __riscv_vsetvl_e64m4(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4; + bi += off * 4; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 4; +#endif +#endif + double B0 = B[bi + 0]; + double B1 = B[bi + 1]; + double B2 = B[bi + 2]; + double B3 = B[bi + 3]; + bi += 4; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); + vfloat64m4_t result2 = __riscv_vfmul_vf_f64m4(A0, B2, gvl); + vfloat64m4_t result3 = __riscv_vfmul_vf_f64m4(A0, B3, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + B1 
= B[bi + 1]; + B2 = B[bi + 2]; + B3 = B[bi + 3]; + bi += 4; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); + result2 = __riscv_vfmacc_vf_f64m4(result2, B2, A0, gvl); + result3 = __riscv_vfmacc_vf_f64m4(result3, B3, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); + vfloat64m4_t c1 = __riscv_vfmul_vf_f64m4(result1, alpha, gvl); + vfloat64m4_t c2 = __riscv_vfmul_vf_f64m4(result2, alpha, gvl); + vfloat64m4_t c3 = __riscv_vfmul_vf_f64m4(result3, alpha, gvl); + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c1, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c2, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c3, gvl); + m_top += 4; + } + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + double result4 = 0; + double result5 = 0; + double result6 = 0; + double result7 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2; + bi += off * 4; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 4; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + result2 += A[ai + 0] * B[bi + 1]; + result3 += A[ai + 1] * B[bi + 1]; + result4 += A[ai + 0] * B[bi + 2]; + result5 += A[ai + 1] * B[bi + 2]; + result6 += A[ai + 0] * B[bi + 3]; + result7 += A[ai + 1] * B[bi + 3]; + ai += 2; + bi += 4; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 0 * ldc + 1] = alpha * result1; + C[ci + 1 * ldc + 0] = alpha * result2; + C[ci + 1 * ldc + 1] = alpha * result3; + C[ci + 2 * ldc + 0] = alpha * result4; + C[ci + 2 * ldc + 1] = alpha * result5; + C[ci + 3 * ldc + 0] = alpha * result6; + C[ci + 3 * ldc + 1] = alpha * result7; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1; + bi += off * 4; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 4; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 0] * B[bi + 1]; + result2 += A[ai + 0] * B[bi + 2]; + result3 += A[ai + 0] * B[bi + 3]; + ai += 1; + bi += 4; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 1 * ldc + 0] = alpha * result1; + C[ci + 2 * ldc + 0] = alpha * result2; + C[ci + 3 * ldc + 0] = alpha * result3; + m_top += 1; + } + + n_top += 4; + } + + // -- tails for N=2 + + if (N & 2) { + gvl = __riscv_vsetvl_e64m4(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 8; + bi += off * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 8; +#else + pass_K = off + 2; +#endif +#endif + double B0 = B[bi + 0]; + 
double B1 = B[bi + 1]; + bi += 2; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + bi += 2; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); + vfloat64m4_t c1 = __riscv_vfmul_vf_f64m4(result1, alpha, gvl); + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c1, gvl); + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e64m4(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4; + bi += off * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 2; +#endif +#endif + double B0 = B[bi + 0]; + double B1 = B[bi + 1]; + bi += 2; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + bi += 2; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); + vfloat64m4_t c1 = __riscv_vfmul_vf_f64m4(result1, alpha, gvl); + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c1, gvl); + m_top += 4; + } + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2; + bi += off * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 2; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + result2 += A[ai + 0] * B[bi + 1]; + result3 += A[ai + 1] * B[bi + 1]; + ai += 2; + bi += 2; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 0 * ldc + 1] = alpha * result1; + C[ci + 1 * ldc + 0] = alpha * result2; + C[ci + 1 * ldc + 1] = alpha * result3; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1; + bi += off * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 2; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 0] * B[bi + 1]; + ai += 1; + bi += 2; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 1 * ldc + 0] = alpha * result1; + m_top 
+= 1; + } + + n_top += 2; + } + + // -- tails for N=1 + + if (N & 1) { + gvl = __riscv_vsetvl_e64m4(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 8; + bi += off * 1; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 8; +#else + pass_K = off + 1; +#endif +#endif + double B0 = B[bi + 0]; + bi += 1; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + bi += 1; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e64m4(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4; + bi += off * 1; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 1; +#endif +#endif + double B0 = B[bi + 0]; + bi += 1; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + bi += 1; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + m_top += 4; + } + + if (M & 2) { + double result0 = 0; + double result1 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2; + bi += off * 1; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 1; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + ai += 2; + bi += 1; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 0 * ldc + 1] = alpha * result1; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1; + bi += off * 1; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 1; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + ai += 1; + bi += 1; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + m_top += 1; + } + + n_top += 1; + } + + return 0; +} diff --git a/kernel/riscv64/dtrmm_kernel_8x8_zvl256b.c b/kernel/riscv64/dtrmm_kernel_8x8_zvl256b.c new file mode 100644 index 000000000..b1739f248 --- /dev/null +++ b/kernel/riscv64/dtrmm_kernel_8x8_zvl256b.c @@ -0,0 +1,1068 @@ +/* + +AUTOGENERATED KERNEL +Settings: + LMUL=1 + M=8 + M_tail_scalar_from=2 + N=8 + 
__riscv_='__riscv_' + complex=False + conjugate=False + cpu='zvl256b' + force_acc_double=False + index_type='BLASLONG' + op='trmm' + param_precision='double' + reg_width_bits=256 + tail_policy='' + trace=False + +Derived: + ELEN_ACC=64 + ELEN_PARAM=64 + LMUL_ACC=1 + VFMACC='__riscv_vfmacc_vf_f64m1' + VFMUL='__riscv_vfmul_vf_f64m1' + VLEV='__riscv_vle64_v_f64m1' + VLSEV='__riscv_vlse64_v_f64m1' + VMACC_TO_ACC='__riscv_vfmacc_vf_f64m1' + VMUL_TO_ACC='__riscv_vfmul_vf_f64m1' + VSETVL='__riscv_vsetvl_e64m1' + VSEV='__riscv_vse64_v_f64m1' + VSSEV='__riscv_vsse64_v_f64m1' + acc_vector_t='vfloat64m1_t' + output='dtrmm_kernel_8x8_zvl256b.c' + param_scalar_t='double' + param_vector_t='vfloat64m1_t' + +*/ + +#include "common.h" + + + +#if defined(LEFT) != defined(TRANSA) + #define BACKWARDS +#endif + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc, BLASLONG offset) + +{ + BLASLONG gvl = 0; + BLASLONG m_top = 0; + BLASLONG n_top = 0; + + + // -- MAIN PASS + + for (BLASLONG j=0; j 0; n--, c += ldc) { + c_offset = c; + + for(chunk=m; chunk > 0; chunk -= vl, c_offset += vl) { + vl = VSETVL(chunk); + + VSEV_FLOAT(c_offset, vx, vl); + } + } + + } else { + + for( ; n > 0; n--, c += ldc) { + c_offset = c; + + for(chunk=m; chunk > 0; chunk -= vl, c_offset += vl) { + vl = VSETVL(chunk); + + vx = VLEV_FLOAT(c_offset, vl); + vx = VFMULVF_FLOAT(vx, beta, vl); + VSEV_FLOAT(c_offset, vx, vl); + } + } + + } + + return 0; +} diff --git a/kernel/riscv64/gemm_ncopy_8_rvv.c b/kernel/riscv64/gemm_ncopy_8_rvv.c new file mode 100644 index 000000000..c652ab0c0 --- /dev/null +++ b/kernel/riscv64/gemm_ncopy_8_rvv.c @@ -0,0 +1,197 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
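The trmm kernels above restrict each tile's k loop to the part of the triangular factor that actually contributes: off is derived from the packing offset plus the tile's row position (LEFT) or column position (right-side case), and then either the first off iterations are skipped and pass_K shortened (BACKWARDS), or the loop is simply cut off at off plus the tile width. A condensed sketch of that bookkeeping with the #ifdef cases folded into parameters; this is an interpretation of the generated code, and all names are illustrative:

/* Hedged sketch of the off / pass_K bookkeeping in the trmm kernels above.
 * tile_m / tile_n are the M and N widths of the current micro-tile
 * (e.g. 8 and 4 for the 8x4 kernel). Returns the number of k iterations
 * to run and how far to advance into the packed A and B panels. */
static long model_trmm_pass_k(long K, long offset, long m_top, long n_top,
                              long tile_m, long tile_n,
                              int left, int backwards,
                              long *ai_skip, long *bi_skip)
{
    long off = left ? (offset + m_top) : (-offset + n_top);
    if (backwards) {
        *ai_skip = off * tile_m;   /* skip packed A for the first off steps */
        *bi_skip = off * tile_n;   /* skip packed B likewise                */
        return K - off;            /* remaining k iterations                */
    }
    *ai_skip = 0;
    *bi_skip = 0;
    return off + (left ? tile_m : tile_n);  /* only the leading iterations  */
}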
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m1(n) +#define FLOAT_V_T vfloat32m1_t +#define FLOAT_VX2_T vfloat32m1x2_t +#define FLOAT_VX4_T vfloat32m1x4_t +#define FLOAT_VX8_T vfloat32m1x8_t +#define VSET_VX2 __riscv_vset_v_f32m1_f32m1x2 +#define VSET_VX4 __riscv_vset_v_f32m1_f32m1x4 +#define VSET_VX8 __riscv_vset_v_f32m1_f32m1x8 +#define VLEV_FLOAT __riscv_vle32_v_f32m1 +#define VSEV_FLOAT __riscv_vse32_v_f32m1 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1x2 +#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1x4 +#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1x8 +#else +#define VSETVL(n) __riscv_vsetvl_e64m1(n) +#define FLOAT_V_T vfloat64m1_t +#define FLOAT_VX2_T vfloat64m1x2_t +#define FLOAT_VX4_T vfloat64m1x4_t +#define FLOAT_VX8_T vfloat64m1x8_t +#define VSET_VX2 __riscv_vset_v_f64m1_f64m1x2 +#define VSET_VX4 __riscv_vset_v_f64m1_f64m1x4 +#define VSET_VX8 __riscv_vset_v_f64m1_f64m1x8 +#define VLEV_FLOAT __riscv_vle64_v_f64m1 +#define VSEV_FLOAT __riscv_vse64_v_f64m1 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1x2 +#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1x4 +#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1x8 +#endif + +// Optimizes the implementation in ../generic/gemm_ncopy_8.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) +{ + BLASLONG i, j; + + FLOAT *a_offset; + FLOAT *a_offset1, *a_offset2, *a_offset3, *a_offset4; + FLOAT *a_offset5, *a_offset6, *a_offset7, *a_offset8; + FLOAT *b_offset; + + FLOAT_V_T v1, v2, v3, v4, v5, v6, v7, v8; + FLOAT_VX2_T vx2; + FLOAT_VX4_T vx4; + FLOAT_VX8_T vx8; + + size_t vl; + + //fprintf(stderr, "gemm_ncopy_8 m=%ld n=%ld lda=%ld\n", m, n, lda); + + a_offset = a; + b_offset = b; + + for(j = (n >> 3); j > 0; j--) { + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset5 = a_offset4 + lda; + a_offset6 = a_offset5 + lda; + a_offset7 = a_offset6 + lda; + a_offset8 = a_offset7 + lda; + a_offset += 8 * lda; + + for(i = m; i > 0; i -= vl) { + vl = VSETVL(i); + + v1 = VLEV_FLOAT(a_offset1, vl); + v2 = VLEV_FLOAT(a_offset2, vl); + v3 = VLEV_FLOAT(a_offset3, vl); + v4 = VLEV_FLOAT(a_offset4, vl); + v5 = VLEV_FLOAT(a_offset5, vl); + v6 = VLEV_FLOAT(a_offset6, vl); + v7 = VLEV_FLOAT(a_offset7, vl); + v8 = VLEV_FLOAT(a_offset8, vl); + + vx8 = VSET_VX8(vx8, 0, v1); + vx8 = VSET_VX8(vx8, 1, v2); + vx8 = VSET_VX8(vx8, 2, v3); + vx8 = VSET_VX8(vx8, 3, v4); + vx8 = VSET_VX8(vx8, 4, v5); + vx8 = VSET_VX8(vx8, 5, v6); + vx8 = VSET_VX8(vx8, 6, v7); + vx8 = VSET_VX8(vx8, 7, v8); + + VSSEG8_FLOAT(b_offset, vx8, vl); + + a_offset1 += vl; + a_offset2 += vl; + a_offset3 += vl; + a_offset4 += vl; + a_offset5 += vl; + a_offset6 += vl; + a_offset7 += vl; + a_offset8 += vl; + b_offset += vl*8; + } + } + + if (n & 4) { + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset += 4 * lda; + + for(i = m; i > 0; i -= vl) { + vl = VSETVL(i); + + v1 = VLEV_FLOAT(a_offset1, vl); + v2 = VLEV_FLOAT(a_offset2, vl); + v3 = VLEV_FLOAT(a_offset3, vl); + v4 = VLEV_FLOAT(a_offset4, vl); + + vx4 = VSET_VX4(vx4, 0, v1); + vx4 = VSET_VX4(vx4, 1, v2); + vx4 = VSET_VX4(vx4, 2, v3); + vx4 = VSET_VX4(vx4, 3, v4); + + VSSEG4_FLOAT(b_offset, vx4, vl); + + a_offset1 += vl; + a_offset2 += vl; + a_offset3 += vl; + a_offset4 += vl; + b_offset += vl*4; + } + } + + if (n & 2) { + a_offset1 = a_offset; + a_offset2 = a_offset1 + 
lda; + a_offset += 2 * lda; + + for(i = m; i > 0; i -= vl) { + vl = VSETVL(i); + + v1 = VLEV_FLOAT(a_offset1, vl); + v2 = VLEV_FLOAT(a_offset2, vl); + + vx2 = VSET_VX2(vx2, 0, v1); + vx2 = VSET_VX2(vx2, 1, v2); + + VSSEG2_FLOAT(b_offset, vx2, vl); + + a_offset1 += vl; + a_offset2 += vl; + b_offset += vl*2; + } + } + + if (n & 1) { + a_offset1 = a_offset; + + for(i = m; i > 0; i -= vl) { + vl = VSETVL(i); + + v1 = VLEV_FLOAT(a_offset1, vl); + + VSEV_FLOAT(b_offset, v1, vl); + + a_offset1 += vl; + b_offset += vl; + } + } + + return 0; +} diff --git a/kernel/riscv64/gemm_ncopy_rvv_v1.c b/kernel/riscv64/gemm_ncopy_rvv_v1.c new file mode 100644 index 000000000..2d6db15e5 --- /dev/null +++ b/kernel/riscv64/gemm_ncopy_rvv_v1.c @@ -0,0 +1,76 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
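The gemm_ncopy_8 routine above relies on segment stores (vsseg8/vsseg4/vsseg2) so that, for every row index, the entries of 8 (or 4, 2) consecutive columns of A land next to each other in the packed buffer. A scalar sketch of the layout produced for one full 8-column panel, assuming column-major A with leading dimension lda (names illustrative):

/* Hedged scalar sketch of one 8-column panel of the packing above:
 * b receives, row by row, the eight column entries of that row. */
static void model_ncopy_8(long m, const float *a, long lda, float *b)
{
    for (long i = 0; i < m; i++)          /* row index within the panel  */
        for (long j = 0; j < 8; j++)      /* 8 consecutive columns of A  */
            *b++ = a[i + j * lda];
}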
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) +{ + BLASLONG i, j; + + FLOAT *a_offset; + FLOAT *a_offset1; + FLOAT *b_offset; + + FLOAT_V_T v0; + size_t vl; + + //fprintf(stderr, "%s, m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda); + + a_offset = a; + b_offset = b; + + for(j = n; j > 0; j -= vl) { + vl = VSETVL(j); + + a_offset1 = a_offset; + a_offset += vl * lda; + + for(i = m; i > 0; i--) { + v0 = VLSEV_FLOAT(a_offset1, lda * sizeof(FLOAT), vl); + VSEV_FLOAT(b_offset, v0, vl); + + a_offset1++; + b_offset += vl; + } + } + + return 0; +} diff --git a/kernel/riscv64/gemm_tcopy_8_rvv.c b/kernel/riscv64/gemm_tcopy_8_rvv.c new file mode 100644 index 000000000..4742ae6a7 --- /dev/null +++ b/kernel/riscv64/gemm_tcopy_8_rvv.c @@ -0,0 +1,273 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
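The ncopy_rvv_v1 variant above instead walks A in column blocks whose width is the runtime vector length: for each row it gathers one element from every column of the block with a strided load (stride lda) and writes them out contiguously. A rough scalar equivalent, with VL standing in for the vector length (names illustrative):

/* Hedged scalar model of the ncopy_rvv_v1 routine above. */
static void model_ncopy_v1(long m, long n, const float *a, long lda,
                           float *b, long VL)
{
    for (long j = 0; j < n; j += VL) {
        long w = (n - j < VL) ? (n - j) : VL;   /* width of this block    */
        const float *col0 = a + j * lda;        /* first column in block  */
        for (long i = 0; i < m; i++)            /* one strided gather/row */
            for (long k = 0; k < w; k++)
                *b++ = col0[i + k * lda];
    }
}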
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m1(n) +#define FLOAT_V_T vfloat32m1_t +#define FLOAT_VX2_T vfloat32m1x2_t +#define FLOAT_VX4_T vfloat32m1x4_t +#define FLOAT_VX8_T vfloat32m1x8_t +#define VLEV_FLOAT __riscv_vle32_v_f32m1 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m1 +#define VSEV_FLOAT __riscv_vse32_v_f32m1 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m1x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1x2 +#define VLSSEG4_FLOAT __riscv_vlsseg4e32_v_f32m1x4 +#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1x4 +#define VLSSEG8_FLOAT __riscv_vlsseg8e32_v_f32m1x8 +#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1x8 +#else +#define VSETVL(n) __riscv_vsetvl_e64m1(n) +#define FLOAT_V_T vfloat64m1_t +#define FLOAT_VX2_T vfloat64m1x2_t +#define FLOAT_VX4_T vfloat64m1x4_t +#define FLOAT_VX8_T vfloat64m1x8_t +#define VLEV_FLOAT __riscv_vle64_v_f64m1 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m1 +#define VSEV_FLOAT __riscv_vse64_v_f64m1 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m1x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1x2 +#define VLSSEG4_FLOAT __riscv_vlsseg4e64_v_f64m1x4 +#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1x4 +#define VLSSEG8_FLOAT __riscv_vlsseg8e64_v_f64m1x8 +#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1x8 +#endif + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) +{ + BLASLONG i, j; + + IFLOAT *aoffset; + IFLOAT *aoffset1; + + IFLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4; + + FLOAT_V_T v0; + FLOAT_VX2_T vx2; + FLOAT_VX4_T vx4; + FLOAT_VX8_T vx8; + + // fprintf(stderr, "gemm_tcopy_8 m=%ld n=%ld lda=%ld\n", m, n, lda); + + aoffset = a; + boffset = b; + boffset2 = b + m * (n & ~7); + boffset3 = b + m * (n & ~3); + boffset4 = b + m * (n & ~1); + + for(j = (m >> 3); j > 0; j--) { + + aoffset1 = aoffset; + aoffset += 8 * lda; + + boffset1 = boffset; + boffset += 64; + + for(i = (n >> 3); i > 0; i--) { + size_t vl = 8; + + vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG8_FLOAT(boffset1, vx8, vl); + + aoffset1 += 8; + boffset1 += m * 8; + } + + if (n & 4) { + size_t vl = 8; + + vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG4_FLOAT(boffset2, vx4, vl); + + aoffset1 += 4; + boffset2 += 32; + } + + if (n & 2) { + size_t vl = 8; + + vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG2_FLOAT(boffset3, vx2, vl); + + aoffset1 += 2; + boffset3 += 16; + } + + if (n & 1) { + size_t vl = 8; + + v0 = VLSEV_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSEV_FLOAT(boffset4, v0, vl); + + aoffset1 += 1; + boffset4 += 8; + } + + } + + if (m & 4) { + + aoffset1 = aoffset; + aoffset += 4 * lda; + + boffset1 = boffset; + boffset += 32; + + for(i = (n >> 3); i > 0; i--) { + size_t vl = 4; + + vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG8_FLOAT(boffset1, vx8, vl); + + aoffset1 += 8; + boffset1 += m * 8; + } + + if (n & 4) { + size_t vl = 4; + + vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG4_FLOAT(boffset2, vx4, vl); + + aoffset1 += 4; + boffset2 += 16; + } + + if (n & 2) { + size_t vl = 4; + + vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG2_FLOAT(boffset3, vx2, vl); + + aoffset1 += 2; + boffset3 += 8; + } + + if (n & 1) { + size_t vl = 4; + + v0 = VLSEV_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSEV_FLOAT(boffset4, v0, vl); + + aoffset1 += 1; + boffset4 += 4; + } + } + + if (m & 2) { + aoffset1 = aoffset; + aoffset += 2 * lda; + + boffset1 = 
boffset; + boffset += 16; + + for(i = (n >> 3); i > 0; i--) { + size_t vl = 2; + + vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG8_FLOAT(boffset1, vx8, vl); + + aoffset1 += 8; + boffset1 += m * 8; + } + + if (n & 4) { + size_t vl = 2; + + vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG4_FLOAT(boffset2, vx4, vl); + + aoffset1 += 4; + boffset2 += 8; + } + + if (n & 2) { + size_t vl = 2; + + vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG2_FLOAT(boffset3, vx2, vl); + + aoffset1 += 2; + boffset3 += 4; + } + + if (n & 1) { + size_t vl = 2; + + v0 = VLSEV_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSEV_FLOAT(boffset4, v0, vl); + + aoffset1 += 1; + boffset4 += 2; + } + } + + if (m & 1) { + aoffset1 = aoffset; + boffset1 = boffset; + + for(i = (n >> 3); i > 0; i--) { + size_t vl = 8; + + v0 = VLEV_FLOAT(aoffset1, vl); + VSEV_FLOAT(boffset1, v0, vl); + + aoffset1 += 8; + boffset1 += 8 * m; + } + + if (n & 4) { + size_t vl = 4; + + v0 = VLEV_FLOAT(aoffset1, vl); + VSEV_FLOAT(boffset2, v0, vl); + + aoffset1 += 4; + //boffset2 += 4; + } + + if (n & 2) { + size_t vl = 2; + + v0 = VLEV_FLOAT(aoffset1, vl); + VSEV_FLOAT(boffset3, v0, vl); + + aoffset1 += 2; + // boffset3 += 2; + } + + if (n & 1) { + *(boffset4) = *(aoffset1); + // aoffset1 ++; + // boffset4 ++; + } + } + + return 0; +} diff --git a/kernel/riscv64/gemm_tcopy_rvv_v1.c b/kernel/riscv64/gemm_tcopy_rvv_v1.c new file mode 100644 index 000000000..c5fb6479f --- /dev/null +++ b/kernel/riscv64/gemm_tcopy_rvv_v1.c @@ -0,0 +1,74 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
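The gemm_tcopy_8 routine above partitions the output buffer up front (boffset2, boffset3 and boffset4 mark where the 4-, 2- and 1-column remainders start) and packs each 8x8 tile with a strided segment load followed by a unit-stride segment store. In element terms, one such tile copy appears to amount to the following scalar sketch, where the rows of the tile sit lda elements apart in A (this is an interpretation of the segment load/store semantics; names illustrative):

/* Hedged scalar model of one full 8x8 tile of the transposed copy above:
 * eight rows spaced lda apart are packed into eight contiguous rows of 8. */
static void model_tcopy_tile8(const float *a, long lda, float *b)
{
    for (long i = 0; i < 8; i++)          /* 8 rows, lda apart in A      */
        for (long k = 0; k < 8; k++)      /* 8 consecutive elements each */
            b[i * 8 + k] = a[i * lda + k];
}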
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#endif + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) +{ + BLASLONG i, j; + + IFLOAT *aoffset; + IFLOAT *aoffset1; + IFLOAT *boffset; + + FLOAT_V_T v0; + size_t vl; + + //fprintf(stderr, "%s, m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda); + + aoffset = a; + boffset = b; + + for(j = n; j > 0; j -= vl) { + vl = VSETVL(j); + + aoffset1 = aoffset; + aoffset += vl; + + for(i = m; i > 0; i--) { + v0 = VLEV_FLOAT(aoffset1, vl); + VSEV_FLOAT(boffset, v0, vl); + + aoffset1 += lda; + boffset += vl; + } + } + + return 0; +} diff --git a/kernel/riscv64/gemmkernel_rvv_v1x8.c b/kernel/riscv64/gemmkernel_rvv_v1x8.c new file mode 100644 index 000000000..471b3158f --- /dev/null +++ b/kernel/riscv64/gemmkernel_rvv_v1x8.c @@ -0,0 +1,601 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
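The gemmkernel_rvv_v1x8 kernel that follows accumulates a block of VL rows by 8 columns of C: every k step broadcasts eight scalars from the packed B panel against one vector of VL packed A elements, and the accumulators are folded into C as C += alpha * acc only once at the end. A compact scalar reference of that block computation; VL and all names are illustrative, and VL <= 64 is assumed only so the sketch can use a fixed-size buffer:

/* Hedged scalar reference for one VL-row by 8-column block of the
 * micro-kernel below. pa holds VL packed A elements per k, pb holds
 * 8 packed B elements per k, and C is column-major with stride ldc. */
static void model_kernel_v1x8(long VL, long bk, float alpha,
                              const float *pa, const float *pb,
                              float *c, long ldc)
{
    float acc[8][64] = {{0}};                   /* 8 column accumulators  */
    for (long k = 0; k < bk; k++)
        for (long j = 0; j < 8; j++)            /* broadcast pb[k*8 + j]  */
            for (long i = 0; i < VL; i++)
                acc[j][i] += pb[k * 8 + j] * pa[k * VL + i];
    for (long j = 0; j < 8; j++)                /* C += alpha * acc       */
        for (long i = 0; i < VL; i++)
            c[j * ldc + i] += alpha * acc[j][i];
}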
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 +#endif + +int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, IFLOAT* ba, IFLOAT* bb, FLOAT* C, BLASLONG ldc +#ifdef TRMMKERNEL + ,BLASLONG offset +#endif + ) +{ + BLASLONG i,j,k; + FLOAT *C0,*C1,*C2,*C3,*C4,*C5,*C6,*C7; + IFLOAT *ptrba,*ptrbb; + + //fprintf(stderr, "%s, bm=%ld bn=%ld bk=%ld alpha=%f ldc=%ld\n", __FUNCTION__, bm, bn, bk, alpha, ldc); // Debug + + FLOAT_V_T va0, va1, va2, va3, va4, va5, va6, va7; + FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7; + size_t vl; + + // N:8 + for (j = bn/8; j > 0; j--) { + C0 = C; + C1 = C0 + ldc; + C2 = C1 + ldc; + C3 = C2 + ldc; + C4 = C3 + ldc; + C5 = C4 + ldc; + C6 = C5 + ldc; + C7 = C6 + ldc; + ptrba = ba; + + for (i = bm; i > 0; i -= vl) { + vl = VSETVL(i); + + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + vres2 = VFMVVF_FLOAT(0.0, vl); + vres3 = VFMVVF_FLOAT(0.0, vl); + vres4 = VFMVVF_FLOAT(0.0, vl); + vres5 = VFMVVF_FLOAT(0.0, vl); + vres6 = VFMVVF_FLOAT(0.0, vl); + vres7 = VFMVVF_FLOAT(0.0, vl); +#if 0 + for (k = bk; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl); + + ptrba += vl; + ptrbb += 8; + } +#else + // Unroll K + for (k = bk/8; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + va1 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl); + ptrbb += 8; + + va2 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va1, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va1, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va1, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va1, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va1, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va1, vl); + ptrbb += 8; + + va3 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va2, 
vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va2, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va2, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va2, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va2, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va2, vl); + ptrbb += 8; + + va4 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va3, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va3, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va3, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va3, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va3, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va3, vl); + ptrbb += 8; + + va5 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va4, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va4, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va4, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va4, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va4, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va4, vl); + ptrbb += 8; + + va6 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va5, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va5, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va5, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va5, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va5, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va5, vl); + ptrbb += 8; + + va7 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va6, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va6, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va6, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va6, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va6, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va6, vl); + ptrbb += 8; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va7, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va7, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va7, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va7, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va7, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va7, vl); + ptrbb += 8; + } + + // K remainder + for (k = bk&7; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl); + + ptrbb += 8; + ptrba += vl; + } +#endif + va0 = VLEV_FLOAT(C0, vl); + va0 = VFMACCVF_FLOAT(va0, alpha, vres0, vl); + VSEV_FLOAT(C0, va0, vl); + + va1 = VLEV_FLOAT(C1, vl); + va1 = VFMACCVF_FLOAT(va1, 
alpha, vres1, vl); + VSEV_FLOAT(C1, va1, vl); + + va2 = VLEV_FLOAT(C2, vl); + va2 = VFMACCVF_FLOAT(va2, alpha, vres2, vl); + VSEV_FLOAT(C2, va2, vl); + + va3 = VLEV_FLOAT(C3, vl); + va3 = VFMACCVF_FLOAT(va3, alpha, vres3, vl); + VSEV_FLOAT(C3, va3, vl); + + va4 = VLEV_FLOAT(C4, vl); + va4 = VFMACCVF_FLOAT(va4, alpha, vres4, vl); + VSEV_FLOAT(C4, va4, vl); + + va5 = VLEV_FLOAT(C5, vl); + va5 = VFMACCVF_FLOAT(va5, alpha, vres5, vl); + VSEV_FLOAT(C5, va5, vl); + + va6 = VLEV_FLOAT(C6, vl); + va6 = VFMACCVF_FLOAT(va6, alpha, vres6, vl); + VSEV_FLOAT(C6, va6, vl); + + va7 = VLEV_FLOAT(C7, vl); + va7 = VFMACCVF_FLOAT(va7, alpha, vres7, vl); + VSEV_FLOAT(C7, va7, vl); + + C0 += vl; + C1 += vl; + C2 += vl; + C3 += vl; + C4 += vl; + C5 += vl; + C6 += vl; + C7 += vl; + } + + bb += (bk<<3); + C += (ldc<<3); + } + + // N:4 + if (bn & 4) { + C0 = C; + C1 = C0 + ldc; + C2 = C1 + ldc; + C3 = C2 + ldc; + ptrba = ba; + + for (i = bm; i > 0; i -= vl) { + vl = VSETVL(i); + + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + vres2 = VFMVVF_FLOAT(0.0, vl); + vres3 = VFMVVF_FLOAT(0.0, vl); + +#if 0 + for (k = bk; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + + ptrba += vl; + ptrbb += 4; + } +#else + // Unroll K + for (k = bk/8; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + va1 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + ptrbb += 4; + va2 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va1, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va1, vl); + ptrbb += 4; + va3 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va2, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va2, vl); + ptrbb += 4; + va4 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va3, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va3, vl); + ptrbb += 4; + va5 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va4, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va4, vl); + ptrbb += 4; + va6 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va5, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va5, vl); + ptrbb += 4; + va7 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va6, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va6, vl); + ptrbb += 4; + + 
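/* Note (not part of the patch): as in the N:8 block above, each A strip is
 * loaded one FMA group ahead of the group that consumes it (va7 below was
 * loaded alongside the va6 FMAs), presumably to hide vector load latency. */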
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va7, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va7, vl); + ptrbb += 4; + } + + // K remainder + for (k = bk&7; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + + ptrbb += 4; + ptrba += vl; + } +#endif + va0 = VLEV_FLOAT(C0, vl); + va0 = VFMACCVF_FLOAT(va0, alpha, vres0, vl); + VSEV_FLOAT(C0, va0, vl); + + va1 = VLEV_FLOAT(C1, vl); + va1 = VFMACCVF_FLOAT(va1, alpha, vres1, vl); + VSEV_FLOAT(C1, va1, vl); + + va2 = VLEV_FLOAT(C2, vl); + va2 = VFMACCVF_FLOAT(va2, alpha, vres2, vl); + VSEV_FLOAT(C2, va2, vl); + + va3 = VLEV_FLOAT(C3, vl); + va3 = VFMACCVF_FLOAT(va3, alpha, vres3, vl); + VSEV_FLOAT(C3, va3, vl); + + C0 += vl; + C1 += vl; + C2 += vl; + C3 += vl; + } + + bb += (bk<<2); + C += (ldc<<2); + } + + // N:2 + if (bn & 2) { + C0 = C; + C1 = C0 + ldc; + ptrba = ba; + + for (i = bm; i > 0; i -= vl) { + vl = VSETVL(i); + + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); +#if 0 + for (k = bk; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + + ptrba += vl; + ptrbb += 2; + } +#else + // Unroll K + for (k = bk/8; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + va1 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + ptrbb += 2; + va2 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl); + ptrbb += 2; + va3 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl); + ptrbb += 2; + va4 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl); + ptrbb += 2; + va5 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl); + ptrbb += 2; + va6 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl); + ptrbb += 2; + va7 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl); + ptrbb += 2; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl); + ptrbb += 2; + } + + // K remainder + for (k = bk&7; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + + ptrbb += 2; + ptrba += vl; + } +#endif + va0 = VLEV_FLOAT(C0, vl); + va0 = VFMACCVF_FLOAT(va0, alpha, vres0, vl); + VSEV_FLOAT(C0, va0, vl); + + va1 = VLEV_FLOAT(C1, vl); + va1 = VFMACCVF_FLOAT(va1, alpha, vres1, vl); + VSEV_FLOAT(C1, va1, vl); + + C0 += vl; + C1 += vl; + } + + bb += (bk<<1); + C += (ldc<<1); + } + + // N:1 + if (bn & 1) { + C0 = C; + ptrba = ba; + + for (i = bm; i > 
0; i -= vl) { + vl = VSETVL(i); + + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vl); +#if 0 + for (k = bk; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + + ptrba += vl; + ptrbb += 1; + } +#else + // Unroll K + for (k = bk/8; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + va1 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + ptrbb += 1; + va2 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); + ptrbb += 1; + va3 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); + ptrbb += 1; + va4 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); + ptrbb += 1; + va5 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); + ptrbb += 1; + va6 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); + ptrbb += 1; + va7 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); + ptrbb += 1; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); + ptrbb += 1; + } + + // K remainder + for (k = bk&7; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + + ptrbb += 1; + ptrba += vl; + } +#endif + va0 = VLEV_FLOAT(C0, vl); + va0 = VFMACCVF_FLOAT(va0, alpha, vres0, vl); + VSEV_FLOAT(C0, va0, vl); + + C0 += vl; + } + + bb += (bk); + C += (ldc); + } + + return 0; +} diff --git a/kernel/riscv64/gemv_n_rvv.c b/kernel/riscv64/gemv_n_rvv.c new file mode 100644 index 000000000..1366eb5ad --- /dev/null +++ b/kernel/riscv64/gemv_n_rvv.c @@ -0,0 +1,94 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
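For reference, the gemmkernel_rvv_v1x8.c micro-kernel added above computes the usual blocked update C := alpha*A*B + C over a bm x bn tile, vectorizing over bm and peeling the column count through 8/4/2/1. A plain scalar equivalent, written against unpacked column-major operands (the packed panel layouts produced by the copy routines are omitted; single precision shown; illustrative sketch only):

/* What the vector micro-kernel computes, ignoring operand packing. */
static void gemm_tile_ref(long bm, long bn, long bk, float alpha,
                          const float *A, long lda,
                          const float *B, long ldb,
                          float *C, long ldc)
{
    for (long j = 0; j < bn; j++)
        for (long i = 0; i < bm; i++) {
            float s = 0.0f;
            for (long k = 0; k < bk; k++)
                s += A[i + k * lda] * B[k + j * ldb];
            C[i + j * ldc] += alpha * s;
        }
}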
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VSEV_FLOAT __riscv_vse32_v_f32m8 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m8 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8 +#else +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VSEV_FLOAT __riscv_vse64_v_f64m8 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m8 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8 +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + if(n < 0) return(0); + + FLOAT *a_ptr, *x_ptr; + BLASLONG i; + FLOAT_V_T va, vy; + + if(inc_y == 1) { + + for (size_t vl; m > 0; m -= vl, y += vl, a += vl) { + vl = VSETVL(m); + a_ptr = a; + x_ptr = x; + vy = VLEV_FLOAT(y, vl); + for(i = 0; i < n; i++) { + va = VLEV_FLOAT(a_ptr, vl); + vy = VFMACCVF_FLOAT(vy, (alpha * (*x_ptr)), va, vl); + + a_ptr += lda; + x_ptr += inc_x; + } + VSEV_FLOAT(y, vy, vl); + } + + } else { + + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for (size_t vl; m > 0; m -= vl, y += vl*inc_y, a += vl) { + vl = VSETVL(m); + a_ptr = a; + x_ptr = x; + vy = VLSEV_FLOAT(y, stride_y, vl); + for(i = 0; i < n; i++) { + va = VLEV_FLOAT(a_ptr, vl); + vy = VFMACCVF_FLOAT(vy, (alpha * (*x_ptr)), va, vl); + + a_ptr += lda; + x_ptr += inc_x; + } + VSSEV_FLOAT(y, stride_y, vy, vl); + } + + } + return(0); +} diff --git a/kernel/riscv64/gemv_n_vector.c b/kernel/riscv64/gemv_n_vector.c index bb9ab8e5a..aa13fc87d 100644 --- a/kernel/riscv64/gemv_n_vector.c +++ b/kernel/riscv64/gemv_n_vector.c @@ -27,21 +27,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
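gemv_n_rvv.c above vectorizes over the rows of A (the m dimension) and keeps the column loop scalar, so each strip of y is loaded once, updated n times, and stored once. Ignoring unit versus strided increments, what it computes is the standard column-major GEMV update; a scalar reference (single precision shown, illustrative only) is:

/* y := y + alpha * A * x, column-major A with leading dimension lda. */
static void gemv_n_ref(long m, long n, float alpha,
                       const float *a, long lda,
                       const float *x, long inc_x,
                       float *y, long inc_y)
{
    for (long j = 0; j < n; j++) {
        float t = alpha * x[j * inc_x];
        for (long i = 0; i < m; i++)
            y[i * inc_y] += t * a[i + j * lda];
    }
}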
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) #define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT vle32_v_f32m4 -#define VLSEV_FLOAT vlse32_v_f32m4 -#define VSEV_FLOAT vse32_v_f32m4 -#define VSSEV_FLOAT vsse32_v_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) #else -#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) #define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT vle64_v_f64m4 -#define VLSEV_FLOAT vlse64_v_f64m4 -#define VSEV_FLOAT vse64_v_f64m4 -#define VSSEV_FLOAT vsse64_v_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) diff --git a/kernel/riscv64/gemv_t_rvv.c b/kernel/riscv64/gemv_t_rvv.c new file mode 100644 index 000000000..9c859aa50 --- /dev/null +++ b/kernel/riscv64/gemv_t_rvv.c @@ -0,0 +1,118 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m8_tu +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m8_tu +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i, j; + FLOAT *a_ptr, *x_ptr; + + FLOAT_V_T va, vx, vr; + FLOAT_V_T_M1 v_res, v_z0; + size_t vlmax = VSETVL_MAX_M1; + v_z0 = VFMVVF_FLOAT_M1(0, vlmax); + vlmax = VSETVL_MAX; + + if(inc_x == 1) { + + for(i = 0; i < n; i++) { + j = m; + a_ptr = a; + x_ptr = x; + vr = VFMVVF_FLOAT(0, vlmax); + + for (size_t vl; j > 0; j -= vl, a_ptr += vl, x_ptr += vl) { + vl = VSETVL(j); + + va = VLEV_FLOAT(a_ptr, vl); + vx = VLEV_FLOAT(x_ptr, vl); + vr = VFMACCVV_FLOAT_TU(vr, va, vx, vl); + } + + v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); + *y += alpha * VFMVFS_FLOAT_M1(v_res); + y += inc_y; + a += lda; + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for(i = 0; i < n; i++) { + j = m; + a_ptr = a; + x_ptr = x; + vr = VFMVVF_FLOAT(0, vlmax); + + for (size_t vl; j > 0; j -= vl, a_ptr += vl, x_ptr += vl*inc_x) { + vl = VSETVL(j); + + va = VLEV_FLOAT(a_ptr, vl); + vx = VLSEV_FLOAT(x_ptr, stride_x, vl); + vr = VFMACCVV_FLOAT_TU(vr, va, vx, vl); + } + + v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); + *y += alpha * VFMVFS_FLOAT_M1(v_res); + y += inc_y; + a += lda; + } + + } + + return(0); +} diff --git a/kernel/riscv64/gemv_t_vector.c b/kernel/riscv64/gemv_t_vector.c index 7d0b70cbb..62b85164c 100644 --- a/kernel/riscv64/gemv_t_vector.c +++ b/kernel/riscv64/gemv_t_vector.c @@ -27,107 +27,110 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
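gemv_t_rvv.c above computes each y[j] as a dot product of column j of A with x: partial products accumulate per lane through the tail-undisturbed vfmacc, and a single vfredusum at the end of the column collapses the lanes. A self-contained sketch of that accumulate-then-reduce pattern, illustrative only (single precision, LMUL=8 as in the kernel, assuming <riscv_vector.h>):

#include <riscv_vector.h>

static float dot_rvv(size_t n, const float *a, const float *b)
{
    size_t vlmax = __riscv_vsetvlmax_e32m8();
    vfloat32m8_t acc = __riscv_vfmv_v_f_f32m8(0.0f, vlmax);
    for (size_t vl; n > 0; n -= vl, a += vl, b += vl) {
        vl = __riscv_vsetvl_e32m8(n);
        vfloat32m8_t va = __riscv_vle32_v_f32m8(a, vl);
        vfloat32m8_t vb = __riscv_vle32_v_f32m8(b, vl);
        /* _tu: lanes at or beyond vl keep their previous partial sums */
        acc = __riscv_vfmacc_vv_f32m8_tu(acc, va, vb, vl);
    }
    vfloat32m1_t zero = __riscv_vfmv_v_f_f32m1(0.0f, 1);
    vfloat32m1_t sum  = __riscv_vfredusum_vs_f32m8_f32m1(acc, zero, vlmax);
    return __riscv_vfmv_f_s_f32m1_f32(sum);
}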
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m4_t +#define VSETVL(n) RISCV_RVV(vsetvl_e32m2)(n) +#define FLOAT_V_T vfloat32m2_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLEV_FLOAT vle32_v_f32m4 -#define VLSEV_FLOAT vlse32_v_f32m4 -#define VFREDSUM_FLOAT vfredosum_vs_f32m4_f32m1 -#define VFMACCVV_FLOAT vfmacc_vv_f32m4 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFDOTVV_FLOAT vfdot_vv_f32m4 -#define VFMULVV_FLOAT vfmul_vv_f32m4 +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m2) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m2) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f32m2_f32m1(v_res, va, vb, gvl) #else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m4_t +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m2_f32m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m2) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m2) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m2) +#define xint_t int +#else +#define VSETVL(n) RISCV_RVV(vsetvl_e64m2)(n) +#define FLOAT_V_T vfloat64m2_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLEV_FLOAT vle64_v_f64m4 -#define VLSEV_FLOAT vlse64_v_f64m4 -#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 -#define VFMACCVV_FLOAT vfmacc_vv_f64m4 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFDOTVV_FLOAT vfdot_vv_f64m4 -#define VFMULVV_FLOAT vfmul_vv_f64m4 +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m2) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m2) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f64m2_f64m1(v_res, va, vb, gvl) +#else +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m2_f64m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m2) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m2) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m2) +#define xint_t long long #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { - BLASLONG i = 0, j = 0, k = 0; - BLASLONG ix = 0, iy = 0; - FLOAT *a_ptr = a; + BLASLONG i = 0, j = 0, k = 0; + BLASLONG ix = 0, iy = 0; + FLOAT *a_ptr = a; FLOAT temp; FLOAT_V_T va, vr, vx; unsigned int gvl = 0; - FLOAT_V_T_M1 v_res, v_z0; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_z0 = VFMVVF_FLOAT_M1(0, gvl); + FLOAT_V_T_M1 v_res; + if(inc_x == 1){ for(i = 0; i < n; i++){ + v_res = VFMVVF_FLOAT_M1(0, 1); gvl = VSETVL(m); j = 0; vr = VFMVVF_FLOAT(0, gvl); for(k = 0; k < m/gvl; k++){ va = VLEV_FLOAT(&a_ptr[j], gvl); vx = VLEV_FLOAT(&x[j], gvl); - vr = VFMACCVV_FLOAT(vr, va, vx, gvl); + vr = VFMULVV_FLOAT(va, vx, gvl); // could vfmacc here and reduce outside loop + v_res = VFREDSUM_FLOAT(vr, v_res, gvl); // but that reordering diverges far enough from scalar path to make tests fail j += gvl; } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp = (FLOAT)VFMVFS_FLOAT(v_res); if(j < m){ gvl = VSETVL(m-j); va = VLEV_FLOAT(&a_ptr[j], gvl); vx = VLEV_FLOAT(&x[j], gvl); vr = VFMULVV_FLOAT(va, vx, gvl); - - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp += (FLOAT)VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_res, gvl); } + temp = (FLOAT)EXTRACT_FLOAT(v_res); 
y[iy] += alpha * temp; + + iy += inc_y; a_ptr += lda; } }else{ BLASLONG stride_x = inc_x * sizeof(FLOAT); - for(i = 0; i < n; i++){ + v_res = VFMVVF_FLOAT_M1(0, 1); gvl = VSETVL(m); - BLASLONG inc_xv = inc_x * gvl; j = 0; ix = 0; vr = VFMVVF_FLOAT(0, gvl); for(k = 0; k < m/gvl; k++){ va = VLEV_FLOAT(&a_ptr[j], gvl); vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); - vr = VFMACCVV_FLOAT(vr, va, vx, gvl); + vr = VFMULVV_FLOAT(va, vx, gvl); + v_res = VFREDSUM_FLOAT(vr, v_res, gvl); j += gvl; - ix += inc_xv; + ix += inc_x * gvl; } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp = (FLOAT)VFMVFS_FLOAT(v_res); if(j < m){ gvl = VSETVL(m-j); va = VLEV_FLOAT(&a_ptr[j], gvl); vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); vr = VFMULVV_FLOAT(va, vx, gvl); - - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp += (FLOAT)VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_res, gvl); } + temp = (FLOAT)EXTRACT_FLOAT(v_res); y[iy] += alpha * temp; + + iy += inc_y; a_ptr += lda; } } + + return(0); } - diff --git a/kernel/riscv64/generate_kernel.py b/kernel/riscv64/generate_kernel.py new file mode 100755 index 000000000..8be7c9f9c --- /dev/null +++ b/kernel/riscv64/generate_kernel.py @@ -0,0 +1,673 @@ +#!/usr/bin/python3 + +import sys, os +import contextlib + +#----------------------------------------------------------------------- +def ERROR(*args, **kwargs): + print(*args, file=sys.stderr, **kwargs) + sys.exit(-1) + +class Target(object): + def __init__( self, out, mappings, initial_level=0, tab_width=4 ): + self._level = initial_level + self._tab_width = tab_width + self._out = out + self._mappings = mappings + + @contextlib.contextmanager + def map( self, **items ): + old_mappings = self._mappings + self._mappings = dict(old_mappings, **items) + yield self._mappings + self._mappings = old_mappings + + @contextlib.contextmanager + def block( self, start=None, end=None, **args ): + with self.map(**args): + if start is not None: + self.write(); + self.write(start) + self._level += 1 + yield self._level + self._level -= 1 + if end is not None: + self.write(end) + self.write() + + def write( self, fmt=None, *args, **kwargs ): + if fmt is not None: + mappings = dict(self._mappings, **kwargs) if kwargs else self._mappings + self._out(self._indent_str() + fmt.format(*args, **mappings)) + else: + self._out("") + + def _indent_str( self ): + return ' ' * (self._level * self._tab_width) + +#----------------------------------------------------------------------- +def generate_trmm_block( dest ): + dest.write("{index_type} pass_K = K;") + dest.write("#ifdef LEFT") + with dest.block(): + dest.write("{index_type} off = offset + m_top;") + dest.write("#else") + with dest.block(): + dest.write("{index_type} off = -offset + n_top;") + dest.write("#endif") + + dest.write("#ifdef BACKWARDS") + with dest.block(): + dest.write("ai += off*{M}{elt_size};") + dest.write("bi += off*{N}{elt_size};") + dest.write("pass_K -= off;") + dest.write("#else") + with dest.block(): + dest.write("#ifdef LEFT") + with dest.block(): + dest.write("pass_K = off + {M};") + dest.write("#else") + with dest.block(): + dest.write("pass_K = off + {N};") + dest.write("#endif") + dest.write("#endif") + +#----------------------------------------------------------------------- +def generate_gemm_kernel_inner_real( settings, dest, M, N, vlen, a_regs ): + TRMM = (settings['op'].value == 'trmm') + narrow_result = (settings['param_precision'].value != 'double') and settings['force_acc_double'].value + + with dest.map( + M=M, + N=N, + ): + dest.write("{index_type} 
ai=m_top*K{elt_size};") + dest.write("{index_type} bi=n_top*K{elt_size};") + if TRMM: + generate_trmm_block( dest ) + + for i in range(N): + dest.write("{param_scalar_t} B{i} = B[bi+{i}];", i=i) + dest.write("bi += {N};") + dest.write() + + for i in range(a_regs): + dest.write("{param_vector_t} A{i} = {VLEV}( &A[ai+{i}*gvl], gvl );", i=i) + dest.write("ai += {M};") + dest.write() + + for j in range(N): + for i in range(a_regs): + dest.write("{acc_vector_t} result{dest} = {VMUL_TO_ACC}( A{i}, B{j}, gvl);", dest=j*a_regs+i, i=i, j=j) + + with dest.block("for({index_type} k=1; k<{Kend}; k++) {{", "}}", Kend=('pass_K' if TRMM else 'K')): + for i in range(N): + dest.write("B{i} = B[bi+{i}];", i=i ) + dest.write("bi += {N};") + dest.write() + + for i in range(a_regs): + dest.write("A{i} = {VLEV}( &A[ai+{i}*gvl], gvl );", i=i) + + dest.write("ai += {M};") + dest.write() + + + for j in range(N): + for i in range(a_regs): + dest.write("result{dest} = {VMACC_TO_ACC}( result{dest}, B{j}, A{i}, gvl);", dest= j*a_regs+i, j=j, i=i ) + + dest.write() + dest.write("{index_type} ci=n_top*ldc+m_top;") + dest.write() + + if narrow_result: + for j in range(N): + for i in range(a_regs): + dest.write("{param_vector_t} narrowed{idx} = {VFNCVT}( result{idx}, gvl );", idx=j*a_regs+i) + + if not TRMM: + for j in range(N): + for i in range(a_regs): + idx = j*a_regs+i + increment = ' ci += ldc-gvl*{};'.format(a_regs-1) if (i == a_regs-1) else ' ci += gvl;' + if idx == N*a_regs-1: + increment = '' + dest.write("{param_vector_t} c{idx} = {VLEV}( &C[ci], gvl);{increment}", idx=idx, increment=increment) + + if narrow_result: + for j in range(N): + for i in range(a_regs): + idx = j*a_regs+i + if TRMM: + dest.write("{param_vector_t} c{idx} = {VFMUL}( narrowed{idx}, alpha, gvl );", idx=idx) + else: + dest.write("c{idx} = {VFMACC}( c{idx}, alpha, narrowed{idx}, gvl );", idx=idx) + else: + for j in range(N): + for i in range(a_regs): + idx = j*a_regs+i + if TRMM: + dest.write("{param_vector_t} c{idx} = {VFMUL}( result{idx}, alpha, gvl );", idx=idx) + else: + dest.write("c{idx} = {VFMACC}( c{idx}, alpha, result{idx}, gvl );", idx=idx) + + + if not TRMM: + dest.write() + dest.write("ci=n_top*ldc+m_top;") + dest.write() + + for j in range(N): + for i in range(a_regs): + idx = j*a_regs+i + increment = ' ci += ldc-gvl*{};'.format(a_regs-1) if (i == a_regs-1) else ' ci += gvl;' + if idx == N*a_regs-1: + increment = '' + dest.write("{VSEV}( &C[ci], c{idx}, gvl);{increment}", idx=idx, increment=increment) + + +#----------------------------------------------------------------------- +def generate_gemm_kernel_inner_complex( settings, dest, M, N, vlen, a_regs ): + TRMM = (settings['op'].value == 'trmm') + narrow_result = (settings['param_precision'].value != 'double') and settings['force_acc_double'].value + + if narrow_result: + raise RuntimeError("wide accumulator not supported for generated complex kernels") + # we could, but we run out of registers really really fast + + with dest.map( + M=M, + N=N, + ): + dest.write("{index_type} ai=m_top*K*2;") + dest.write("{index_type} bi=n_top*K*2;") + if TRMM: + generate_trmm_block( dest ) + + for i in range(N): + dest.write("{param_scalar_t} B{i}r = B[bi+{i}*2+0];", i=i) + dest.write("{param_scalar_t} B{i}i = B[bi+{i}*2+1];", i=i) + dest.write("bi += {N}*2;") + dest.write() + + for i in range(a_regs): + dest.write("{param_vector_t} A{i}r = {VLSEV}( &A[ai+{i}*gvl*2], sizeof(FLOAT)*2, gvl );", i=i) + dest.write("{param_vector_t} A{i}i = {VLSEV}( &A[ai+{i}*gvl*2+1], sizeof(FLOAT)*2, gvl );", 
i=i) + dest.write("ai += {M}*2;") + dest.write() + + # for each vector register loaded from matrix A, we require N registers to hold vector-scalar multiply-accumulate results + accumulation_regs = a_regs * N + dest.write("// {a_regs} vector regs to hold A array contents, {accumulation_regs} regs to hold values accumulated over k", + a_regs=a_regs*2, accumulation_regs=accumulation_regs*2 + ) + pass_regs = (accumulation_regs + a_regs)*2 + tmp_regs = (32 // settings['LMUL_ACC'].value) - pass_regs + if tmp_regs < 2: + raise RuntimeError("Complex kernel would use too many registers!") + + dest.write("// leaving {tmp_regs} vector registers for temporaries", tmp_regs=tmp_regs) + + tmp_unroll_i = min(tmp_regs, a_regs) + tmp_unroll_j = N + while tmp_unroll_j > 1 and (tmp_regs/(tmp_unroll_i*2)) < tmp_unroll_j: + tmp_unroll_j = int(tmp_unroll_j / 2) + + if tmp_unroll_i < a_regs or tmp_unroll_j < N: + dest.write("// performing {ops} operations between reuses of temporaries", ops=tmp_unroll_j*tmp_unroll_i) + + for tj in range(0, N, tmp_unroll_j): + for ti in range(0, a_regs, tmp_unroll_i): + for j in range(tj, tj+tmp_unroll_j): + for i in range(ti, ti+tmp_unroll_i): + with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j): + if ti == 0 and tj==0: + dest.write("{acc_vector_t} tmp{tmp}r = {VMUL_TO_ACC}( A{i}i, B{j}i, gvl);") + dest.write("{acc_vector_t} tmp{tmp}i = {VMUL_TO_ACC}( A{i}r, B{j}i, gvl);") + else: + dest.write("tmp{tmp}r = {VMUL_TO_ACC}( A{i}i, B{j}i, gvl);") + dest.write("tmp{tmp}i = {VMUL_TO_ACC}( A{i}r, B{j}i, gvl);") + for j in range(tj, tj+tmp_unroll_j): + for i in range(ti, ti+tmp_unroll_i): + with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j): + dest.write("tmp{tmp}r = VFMACC_RR( tmp{tmp}r, B{j}r, A{i}r, gvl);") + dest.write("tmp{tmp}i = VFMACC_RI( tmp{tmp}i, B{j}r, A{i}i, gvl);") + + for j in range(tj, tj+tmp_unroll_j): + for i in range(ti, ti+tmp_unroll_i): + with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j): + dest.write("{acc_vector_t} ACC{dest}r = tmp{tmp}r;") + dest.write("{acc_vector_t} ACC{dest}i = tmp{tmp}i;") + + with dest.block("for({index_type} k=1; k<{Kend}; k++) {{", "}}", Kend=('pass_K' if TRMM else 'K')): + for i in range(N): + dest.write("B{i}r = B[bi+{i}*2+0];", i=i) + dest.write("B{i}i = B[bi+{i}*2+1];", i=i) + dest.write("bi += {N}*2;") + dest.write() + + for i in range(a_regs): + dest.write("A{i}r = {VLSEV}( &A[ai+{i}*gvl*2], sizeof(FLOAT)*2, gvl );", i=i) + dest.write("A{i}i = {VLSEV}( &A[ai+{i}*gvl*2+1], sizeof(FLOAT)*2, gvl );", i=i) + + dest.write("ai += {M}*2;") + dest.write() + + + for tj in range(0, N, tmp_unroll_j): + for ti in range(0, a_regs, tmp_unroll_i): + # note the values in tmp{tmp}* are frequently of similar magnitude and opposite sign + # so accumulating them directly to ACC would lose precision when ACC is larger + + for j in range(tj, tj+tmp_unroll_j): + for i in range(ti, ti+tmp_unroll_i): + with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j): + dest.write("tmp{tmp}r = {VMUL_TO_ACC}( A{i}i, B{j}i, gvl);") + dest.write("tmp{tmp}i = {VMUL_TO_ACC}( A{i}r, B{j}i, gvl);") + for j in range(tj, tj+tmp_unroll_j): + for i in range(ti, ti+tmp_unroll_i): + with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j): + dest.write("tmp{tmp}r = VFMACC_RR( tmp{tmp}r, B{j}r, A{i}r, gvl);") + dest.write("tmp{tmp}i = VFMACC_RI( tmp{tmp}i, B{j}r, A{i}i, gvl);") + for j in range(tj, tj+tmp_unroll_j): + for i in range(ti, ti+tmp_unroll_i): + with 
dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j): + dest.write("ACC{dest}r = {__riscv_}vfadd( ACC{dest}r, tmp{tmp}r, gvl);") + dest.write("ACC{dest}i = {__riscv_}vfadd( ACC{dest}i, tmp{tmp}i, gvl);") + + dest.write() + dest.write("{index_type} ci=n_top*ldc+m_top;") + dest.write() + + for j in range(N): + if TRMM: + for i in range(a_regs): + with dest.map(idx=j*a_regs+i): + dest.write("{param_vector_t} C{idx}r = {__riscv_}vfmul( ACC{idx}r, alphar, gvl );") + dest.write("{param_vector_t} C{idx}i = {__riscv_}vfmul( ACC{idx}i, alphar, gvl );") + else: + for i in range(a_regs): + idx = j*a_regs+i + increment = 'ci += ldc-gvl*{};'.format(a_regs-1) if (i == a_regs-1) else ' ci += gvl;' + if idx == N*a_regs-1: + increment = '' + with dest.map(idx=j*a_regs+i, increment=increment): + dest.write("{param_vector_t} C{idx}r = {VLSEV}( &C[ci*2+0], sizeof(FLOAT)*2, gvl );") + dest.write("{param_vector_t} C{idx}i = {VLSEV}( &C[ci*2+1], sizeof(FLOAT)*2, gvl );") + dest.write("{increment}") + + if not TRMM: + for j in range(N): + for i in range(a_regs): + with dest.map(idx=j*a_regs+i): + dest.write("C{idx}r = {__riscv_}vfmacc( C{idx}r, alphar, ACC{idx}r, gvl );") + dest.write("C{idx}i = {__riscv_}vfmacc( C{idx}i, alphar, ACC{idx}i, gvl );") + + for j in range(N): + for i in range(a_regs): + with dest.map(idx=j*a_regs+i): + dest.write("C{idx}r = {__riscv_}vfnmsac( C{idx}r, alphai, ACC{idx}i, gvl );") + dest.write("C{idx}i = {__riscv_}vfmacc ( C{idx}i, alphai, ACC{idx}r, gvl );") + + if not TRMM: + dest.write() + dest.write("ci=n_top*ldc+m_top;") + dest.write() + + for j in range(N): + for i in range(a_regs): + idx = j*a_regs+i + increment = 'ci += ldc-gvl*{};'.format(a_regs-1) if (i == a_regs-1) else ' ci += gvl;' + if idx == N*a_regs-1: + increment = '' + with dest.map(idx=j*a_regs+i, increment=increment): + dest.write("{VSSEV}( &C[ci*2+0], sizeof(FLOAT)*2, C{idx}r, gvl);") + dest.write("{VSSEV}( &C[ci*2+1], sizeof(FLOAT)*2, C{idx}i, gvl);") + dest.write("{increment}") + +#----------------------------------------------------------------------- +def generate_gemm_kernel( settings, OUTPUT ): + if settings['conjugate'].value: + ERROR('conjugate gemm not yet supported') + + is_complex = settings['complex'].value + generate_gemm_kernel_inner = generate_gemm_kernel_inner_complex if is_complex else generate_gemm_kernel_inner_real + dest = Target(OUTPUT, { k:str(settings[k].value) for k in settings }) + + M = settings['M'].value + N = settings['N'].value + vlenmax = int(settings['reg_width_bits'].value * settings['LMUL_ACC'].value / + settings['ELEN_PARAM'].value) + a_regs = max(int(M/vlenmax), 1) + + # for each vector register loaded from matrix A, we require N registers to hold vector-scalar multiply-accumulate results + accumulation_regs = a_regs * N + required_regs = accumulation_regs + a_regs + if is_complex: + required_regs = required_regs * 2 + 2 + dest.write(''' +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + #define S0 1 + #define S1 -1 + #define S2 1 + #define S3 1 + #define VFMACC_RR __riscv_vfmsac{tail_policy} + #define VFMACC_RI __riscv_vfmacc{tail_policy} +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + #define S0 1 + #define S1 1 + #define S2 1 + #define S3 -1 + #define VFMACC_RR __riscv_vfmacc{tail_policy} + #define VFMACC_RI __riscv_vfmsac{tail_policy} +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + #define S0 1 + #define S1 1 + #define S2 -1 + #define S3 1 + #define VFMACC_RR __riscv_vfmacc{tail_policy} + #define 
VFMACC_RI __riscv_vfnmsac{tail_policy} +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + #define S0 1 + #define S1 -1 + #define S2 -1 + #define S3 -1 + #define VFMACC_RR __riscv_vfmsac{tail_policy} + #define VFMACC_RI __riscv_vfnmacc{tail_policy} +#endif +'''.format(tail_policy=settings['tail_policy'].value)) + + + if required_regs > (32 // settings['LMUL_ACC'].value): + raise Exception("{} vector registers needed during accumulation for unrolling {} x {}{} but only {} are available".format( + required_regs, N, M, (" with wide accumulator" if settings['LMUL_ACC'].value > 1 else ''), 32 // settings['LMUL_ACC'].value + )) + + TRMM = (settings['op'].value == 'trmm') + if TRMM: + with dest.block("#if defined(LEFT) != defined(TRANSA)", "#endif"): + dest.write("#define BACKWARDS") + + dest.write("int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, {alpha}, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc{trmm})", + alpha = ('FLOAT alphar, FLOAT alphai' if is_complex else 'FLOAT alpha'), + trmm = (', BLASLONG offset' if TRMM else '') + ) + + with dest.block("{{", "}}", elt_size='*2' if is_complex else ''): + if settings['trace'].value: + dest.write("printf(\"\\n\\nENTRY: %s(%d) M %d N %d K %d ldc %d\\n\", __FILE__, __LINE__, M, N, K, ldc);") + dest.write("{index_type} gvl = 0;") + dest.write("{index_type} m_top = 0;") + dest.write("{index_type} n_top = 0;") + + dest.write() + dest.write() + dest.write("// -- MAIN PASS") + + with dest.block("for ({index_type} j=0; j 0 ): + with dest.map(N=N_tail): + dest.write() + dest.write() + dest.write("// -- tails for N={N}") + with dest.block("if( N & {N} ) {{", "}}" ): + if settings['trace'].value: + dest.write("printf(\"N tail entry: %s(%d) M %d N %d K %d m_top %d n_top %d\\n\", __FILE__, __LINE__, M, N, K, m_top, n_top);") + dest.write("gvl = {VSETVL}({vlenmax});", vlenmax=min(vlenmax,max(int(M/a_regs),1))) + dest.write("m_top = 0;") + with dest.block("for ({index_type} i=0; i M_tail_min ): + with dest.block("if( M & {M_tail} ) {{", "}}", M_tail=M_tail ): + if settings['trace'].value: + dest.write("printf(\"tail: %s(%d) M %d N %d K %d m_top %d n_top %d\\n\", __FILE__, __LINE__, M, N, K, m_top, n_top);") + a_regs = max( 1, int(M_tail/vlenmax) ) + vlen = int(M_tail/a_regs) + dest.write("gvl = {VSETVL}({vlen});\n", vlen=vlen) + + generate_gemm_kernel_inner( settings, dest, M_tail, N, vlen, a_regs ) + dest.write( "m_top += {M_tail};" ) + + M_tail = int( M_tail / 2 ) + + while( M_tail > 0 ): + with dest.block("if( M & {M_tail} ) {{", "}}", + M_tail=M_tail, + N=N, + result_t = ('double' if settings['force_acc_double'].value else settings['param_scalar_t'].value) + ): + if settings['trace'].value: + dest.write("printf(\"tail: %s(%d) M %d N %d K %d m_top %d n_top %d\\n\", __FILE__, __LINE__, M, N, K, m_top, n_top);") + for r in range(M_tail * N * (2 if is_complex else 1)): + dest.write("{result_t} result{r} = 0;", + r=r + ) + + dest.write("{index_type} ai=m_top*K{elt_size};") + dest.write("{index_type} bi=n_top*K{elt_size};") + + if TRMM: + with dest.map(M=M_tail, N=N): + generate_trmm_block( dest ) + + with dest.block("for({index_type} k=0; k<{Kend}; k++) {{", "}}", Kend = ('pass_K' if TRMM else 'K') ): + for ki in range( N ): + for kj in range( M_tail ): + if is_complex: + dest.write("result{dest}+=S0*A[ai+{kj}+0]*B[bi+{ki}+0] + S1*A[ai+{kj}+1]*B[bi+{ki}+1];".format( + dest=(ki*M_tail+kj)*2, kj=kj*2, ki=ki*2 + )) + dest.write("result{dest}+=S2*A[ai+{kj}+1]*B[bi+{ki}+0] + S3*A[ai+{kj}+0]*B[bi+{ki}+1];".format( + dest=(ki*M_tail+kj)*2+1, kj=kj*2, 
ki=ki*2 + )) + else: + dest.write("result{dest}+=A[ai+{kj}]*B[bi+{ki}];".format( + dest=ki*M_tail+kj, kj=kj, ki=ki + )) + dest.write("ai+={M_tail}{elt_size};") + dest.write("bi+={N}{elt_size};") + + dest.write("{index_type} ci=n_top*ldc+m_top;") + if is_complex: + dest.write("{result_t} Cr, Ci;") + for ki in range( N ): + for kj in range( M_tail ): + if is_complex: + if TRMM: + dest.write('Cr = result{dest}*alphar;', dest=(ki*M_tail+kj)*2+0) + dest.write('Ci = result{dest}*alphar;', dest=(ki*M_tail+kj)*2+1) + else: + dest.write('Cr = C[(ci+{ki}*ldc+{kj})*2+0];', ki=ki, kj=kj) + dest.write('Ci = C[(ci+{ki}*ldc+{kj})*2+1];', ki=ki, kj=kj) + dest.write('Cr += result{dest}*alphar;', dest=(ki*M_tail+kj)*2+0) + dest.write('Ci += result{dest}*alphar;', dest=(ki*M_tail+kj)*2+1) + dest.write('Cr -= result{dest}*alphai;', dest=(ki*M_tail+kj)*2+1) + dest.write('Ci += result{dest}*alphai;', dest=(ki*M_tail+kj)*2+0) + dest.write("C[(ci+{ki}*ldc+{kj})*2+0] = Cr;", ki=ki, kj=kj ) + dest.write("C[(ci+{ki}*ldc+{kj})*2+1] = Ci;", ki=ki, kj=kj ) + else: + op = '' if TRMM else '+' + dest.write("C[ci+{ki}*ldc+{kj}] {op}= alpha * result{dest};", + ki=ki, kj=kj, op=op, dest=ki*M_tail+kj + ) + dest.write("m_top+={M_tail};") + + M_tail = int(M_tail/2) + + +#----------------------------------------------------------------------- +class Setting(object): + def __init__( self, value, convert = None ): + self._value = value + self._convert = convert + + @classmethod + def ENUM( cls, *values ): + def closure( values ): + return lambda value: values[value.lower()] + return closure( { v.lower():v for v in values } ) + + @classmethod + def BOOL( cls, value ): + return value.lower().startswith('t') or value == '1' + + @property + def value( self ): + return self._value + + @property + def configurable( self ): + return self._convert is not None + + @value.setter + def value( self, value ): + self._value = self._convert( value ) + + def __str__( self ): + return str(self._value) + +#----------------------------------------------------------------------- +def main(): + settings = { + 'op': Setting( 'gemm', Setting.ENUM( 'gemm', 'trmm' ) ), + 'M': Setting( 16, int ), + 'N': Setting( 4, int ), + 'reg_width_bits': Setting( 256, int ), + 'LMUL': Setting( 1, int ), + 'M_tail_scalar_from':Setting( 2, int ), + 'cpu': Setting( 'zvl256b', str ), + 'param_precision': Setting( 'float', Setting.ENUM( 'float', 'double' ) ), + 'force_acc_double': Setting( False, Setting.BOOL ), + 'complex': Setting( False, Setting.BOOL ), + 'conjugate': Setting( False, Setting.BOOL ), + 'index_type': Setting( 'BLASLONG', str ), + 'trace': Setting( False, Setting.BOOL ), + 'output': Setting( None, str ), + 'tail_policy': Setting( '', str ), # _ta, if toolchain supports it + '__riscv_': Setting( '__riscv_', str), + } + + for item in sys.argv[1:]: + try: + name, value = tuple(item.split( '=', 1 )) + except: + ERROR("couldn't parse {}, expected arguments of the form name=value".format(item)) + + if name not in settings: + ERROR("couldn't parse {}, {} it is not a known option\n".format( item, name ) + +"options (and current defaults) are\n{}".format( + " ".join([ '{}={}'.format(k, settings[k].value) for k in settings.keys()])) + ) + + try: + settings[name].value = value + except: + import traceback + traceback.print_exc() + ERROR("couldn't parse {}".format(item)) + + if settings['output'].value is None: + if settings['complex'].value: + prefix = 'z' if settings['param_precision'].value == 'double' else 'c' + else: + prefix = 'd' if 
settings['param_precision'].value == 'double' else 's' + settings['output'] = Setting('{}{}_kernel_{}x{}_{}.c'.format( + prefix, + settings['op'], + settings['M'], + settings['N'], + settings['cpu'] + )) + + if settings['param_precision'].value == 'double': + settings['param_scalar_t'] = Setting( 'double' ) + settings['ELEN_PARAM'] = Setting(64) + else: + settings['param_scalar_t'] = Setting( 'float' ) + settings['ELEN_PARAM'] = Setting(32) + + settings['VFMUL'] = Setting( '{}vfmul_vf_f{}m{}{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['LMUL'], settings['tail_policy']) ) + settings['VFMACC'] = Setting( '{}vfmacc_vf_f{}m{}{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['LMUL'], settings['tail_policy']) ) + + settings['ELEN_ACC'] = settings['ELEN_PARAM'] + settings['LMUL_ACC'] = Setting(settings['LMUL'].value) + widen = '' + + if settings['force_acc_double'].value and (settings['param_precision'].value == 'float'): + settings['ELEN_ACC'] = Setting(64) + settings['LMUL_ACC'] = Setting(settings['LMUL'].value*2) + settings['VFNCVT'] = Setting('{}vfncvt_f_f_w_f{}m{}{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['LMUL'], settings['tail_policy'])) + widen = 'w' + + settings['VMUL_TO_ACC'] = Setting( '{}vf{}mul_vf_f{}m{}{}'.format(settings['__riscv_'], widen, settings['ELEN_ACC'], settings['LMUL_ACC'], settings['tail_policy']) ) + settings['VMACC_TO_ACC'] = Setting( '{}vf{}macc_vf_f{}m{}{}'.format(settings['__riscv_'], widen, settings['ELEN_ACC'], settings['LMUL_ACC'], settings['tail_policy']) ) + + settings['param_vector_t']=Setting('vfloat{}m{}_t'.format(settings['ELEN_PARAM'], settings['LMUL'])) + settings['acc_vector_t'] =Setting('vfloat{}m{}_t'.format(settings['ELEN_ACC'], settings['LMUL_ACC'])) + settings['VLEV'] =Setting('{}vle{}_v_f{}m{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['ELEN_PARAM'], settings['LMUL'])) + settings['VSEV'] =Setting('{}vse{}_v_f{}m{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['ELEN_PARAM'], settings['LMUL'])) + settings['VLSEV'] =Setting('{}vlse{}_v_f{}m{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['ELEN_PARAM'], settings['LMUL'])) + settings['VSSEV'] =Setting('{}vsse{}_v_f{}m{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['ELEN_PARAM'], settings['LMUL'])) + settings['VSETVL'] =Setting('{}vsetvl_e{}m{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['LMUL'])) + + + to_stdout = (settings['output'].value == '-') + if not to_stdout: + print("Writing {}".format(settings['output'].value), file=sys.stderr) + + with open(sys.stdout.fileno() if to_stdout else settings['output'].value, 'w') as destination_file: + def OUTPUT(*args, **kwargs): + print(*args, file=destination_file, **kwargs) + + OUTPUT("/*\n\nAUTOGENERATED KERNEL\nSettings:\n {}".format(" ".join([ "{}={}\n".format(k, repr(settings[k].value)) for k in sorted(settings.keys()) if settings[k].configurable]))) + OUTPUT("Derived:\n {}\n*/\n".format(" ".join([ "{}={}\n".format(k, repr(settings[k].value)) for k in sorted(settings.keys()) if not settings[k].configurable]))) + + OUTPUT('#include "common.h"') + OUTPUT("\n") + + if settings['op'].value in ('gemm', 'trmm'): + generate_gemm_kernel(settings, OUTPUT) + else: + ERROR("unsupported kernel type {}".format(settings['op'])) + +if __name__ == "__main__": + main() diff --git a/kernel/riscv64/iamax_rvv.c b/kernel/riscv64/iamax_rvv.c new file mode 100644 index 000000000..8362d7cef --- /dev/null +++ 
b/kernel/riscv64/iamax_rvv.c @@ -0,0 +1,149 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f64m8_b8 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m8_b8 +#define VMFGEVF_FLOAT __riscv_vmfge_vf_f64m8_b8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFABSV_FLOAT __riscv_vfabs_v_f64m8 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f64m8_tu +#define VFIRSTM __riscv_vfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT_TU __riscv_vid_v_u64m8_tumu +#define VIDV_UINT __riscv_vid_v_u64m8 +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u64m8_tumu +#define VADDVX_UINT __riscv_vadd_vx_u64m8 +#define VMVVX_UINT __riscv_vmv_v_x_u64m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u64m8 +#define VMVVXS_UINT __riscv_vmv_x_s_u64m8_u64 +#else +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f32m8_b4 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m8_b4 +#define VMFGEVF_FLOAT __riscv_vmfge_vf_f32m8_b4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFABSV_FLOAT __riscv_vfabs_v_f32m8 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f32m8_tu +#define 
VFIRSTM __riscv_vfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT_TU __riscv_vid_v_u32m8_tumu +#define VIDV_UINT __riscv_vid_v_u32m8 +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u32m8_tumu +#define VADDVX_UINT __riscv_vadd_vx_u32m8 +#define VMVVX_UINT __riscv_vmv_v_x_u32m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u32m8 +#define VMVVXS_UINT __riscv_vmv_x_s_u32m8_u32 +#endif + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + unsigned int max_index = 0; + if (n <= 0 || inc_x <= 0) return(max_index); + + FLOAT_V_T vx, v_max; + UINT_V_T v_max_index; + MASK_T mask; + + size_t vlmax = VSETVL_MAX; + v_max_index = VMVVX_UINT(0, vlmax); + v_max = VFMVVF_FLOAT(-1, vlmax); + BLASLONG j=0; + FLOAT maxf=0.0; + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl, j += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vx = VFABSV_FLOAT(vx, vl); + + //index where element greater than v_max + mask = VMFLTVV_FLOAT(v_max, vx, vl); + v_max_index = VIDV_MASK_UINT_TU(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT_TU(mask, v_max_index, v_max_index, j, vl); + + //update v_max + v_max = VFMAXVV_FLOAT_TU(v_max, v_max, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, j += vl) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vx = VFABSV_FLOAT(vx, vl); + + //index where element greater than v_max + mask = VMFLTVV_FLOAT(v_max, vx, vl); + v_max_index = VIDV_MASK_UINT_TU(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT_TU(mask, v_max_index, v_max_index, j, vl); + + //update v_max + v_max = VFMAXVV_FLOAT_TU(v_max, v_max, vx, vl); + } + + } + + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(0, vlmax); + + v_res = VFREDMAXVS_FLOAT(v_max, v_res, vlmax); + maxf = VFMVFS_FLOAT_M1(v_res); + mask = VMFGEVF_FLOAT(v_max, maxf, vlmax); + max_index = VFIRSTM(mask, vlmax); + + v_max_index = VSLIDEDOWN_UINT(v_max_index, max_index, vlmax); + max_index = VMVVXS_UINT(v_max_index); + + return(max_index+1); +} diff --git a/kernel/riscv64/iamax_vector.c b/kernel/riscv64/iamax_vector.c index 4242af6ea..800312400 100644 --- a/kernel/riscv64/iamax_vector.c +++ b/kernel/riscv64/iamax_vector.c @@ -27,127 +27,123 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
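Both iamax implementations touched here, the new iamax_rvv.c above and the reworked iamax_vector.c below, follow the BLAS contract: return the 1-based index of the first element with the largest absolute value, or 0 when n or inc_x is non-positive. A scalar reference of that contract (single precision shown, illustrative only):

#include <math.h>

static long iamax_ref(long n, const float *x, long inc_x)
{
    if (n <= 0 || inc_x <= 0) return 0;
    long best = 0;
    float maxf = fabsf(x[0]);
    for (long i = 1; i < n; i++) {
        float v = fabsf(x[i * inc_x]);
        if (v > maxf) { maxf = v; best = i; }  /* strict >: keep the first maximum */
    }
    return best + 1;  /* BLAS indices are 1-based */
}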
#include "common.h" #include +#include #if defined(DOUBLE) -#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t + +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) +#define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 -#define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m -#define VFMAXVV_FLOAT vfmax_vv_f64m8 -#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8 -#define VMFIRSTM vmfirst_m_b8 -#define UINT_V_T vuint64m8_t -#define VIDV_MASK_UINT vid_v_u64m8_m -#define VIDV_UINT vid_v_u64m8 -#define VADDVX_MASK_UINT vadd_vx_u64m8_m -#define VADDVX_UINT vadd_vx_u64m8 -#define VMVVX_UINT vmv_v_x_u64m8 +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMAXVS_FLOAT(va, vb, gvl) vfredmax_vs_f64m4_f64m1(v_res, va, vb, gvl) +#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u64m4_m) +#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u64m4)(vm, compressed, va, gvl) +#else +#define VFREDMAXVS_FLOAT RISCV_RVV(vfredmax_vs_f64m4_f64m1) +#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u64m4_mu) +#define VCOMPRESS RISCV_RVV(vcompress_vm_u64m4) +#endif +#define MASK_T vbool16_t +#define VMFLTVV_FLOAT RISCV_RVV(vmflt_vv_f64m4_b16) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFMAXVV_FLOAT RISCV_RVV(vfmax_vv_f64m4) +#define VMFGEVF_FLOAT RISCV_RVV(vmfge_vf_f64m4_b16) +#define VMFIRSTM RISCV_RVV(vfirst_m_b16) +#define UINT_V_T vuint64m4_t +#define VIDV_UINT RISCV_RVV(vid_v_u64m4) +#define VADDVX_UINT RISCV_RVV(vadd_vx_u64m4) +#define VMVVX_UINT RISCV_RVV(vmv_v_x_u64m4) +#define VFABS_FLOAT RISCV_RVV(vfabs_v_f64m4) +#define VMV_X RISCV_RVV(vmv_x_s_u64m4_u64) #else -#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t + +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) +#define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 -#define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m -#define VFMAXVV_FLOAT vfmax_vv_f32m8 -#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4 -#define VMFIRSTM vmfirst_m_b4 -#define UINT_V_T vuint32m8_t -#define VIDV_MASK_UINT vid_v_u32m8_m -#define VIDV_UINT vid_v_u32m8 -#define VADDVX_MASK_UINT vadd_vx_u32m8_m -#define VADDVX_UINT vadd_vx_u32m8 -#define VMVVX_UINT vmv_v_x_u32m8 +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMAXVS_FLOAT(va, vb, gvl) vfredmax_vs_f32m4_f32m1(v_res, va, vb, gvl) +#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u32m4_m) +#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u32m4)(vm, compressed, va, gvl) +#else +#define VFREDMAXVS_FLOAT RISCV_RVV(vfredmax_vs_f32m4_f32m1) +#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u32m4_mu) +#define VCOMPRESS RISCV_RVV(vcompress_vm_u32m4) +#endif 
+#define MASK_T vbool8_t +#define VMFLTVV_FLOAT RISCV_RVV(vmflt_vv_f32m4_b8) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFMAXVV_FLOAT RISCV_RVV(vfmax_vv_f32m4) +#define VMFGEVF_FLOAT RISCV_RVV(vmfge_vf_f32m4_b8) +#define VMFIRSTM RISCV_RVV(vfirst_m_b8) +#define UINT_V_T vuint32m4_t +#define VIDV_UINT RISCV_RVV(vid_v_u32m4) +#define VADDVX_UINT RISCV_RVV(vadd_vx_u32m4) +#define VMVVX_UINT RISCV_RVV(vmv_v_x_u32m4) +#define VFABS_FLOAT RISCV_RVV(vfabs_v_f32m4) +#define VMV_X RISCV_RVV(vmv_x_s_u32m4_u32) #endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i=0, j=0; - FLOAT maxf=0.0; -#ifdef DOUBLE - BLASLONG max_index = 0; -#else + BLASLONG i=0, j=0; unsigned int max_index = 0; -#endif - if (n <= 0 || inc_x <= 0) return(max_index); + if (n <= 0 || inc_x <= 0) return(max_index); + FLOAT maxf=-FLT_MAX; FLOAT_V_T vx, v_max; UINT_V_T v_max_index; MASK_T mask; unsigned int gvl = 0; - FLOAT_V_T_M1 v_res, v_z0; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_z0 = VFMVVF_FLOAT_M1(0, gvl); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(-FLT_MAX, 1); + + gvl = VSETVL(n); + UINT_V_T vid = VIDV_UINT(gvl); if(inc_x == 1){ - gvl = VSETVL(n); v_max_index = VMVVX_UINT(0, gvl); - v_max = VFMVVF_FLOAT(-1, gvl); + v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); for(i=0,j=0; i < n/gvl; i++){ vx = VLEV_FLOAT(&x[j], gvl); - //fabs(vector) - mask = VMFLTVF_FLOAT(vx, 0, gvl); - vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); + vx = VFABS_FLOAT(vx, gvl); //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, gvl); - v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j,gvl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, vid, j, gvl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx, gvl); j += gvl; } - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); - maxf = VFMVFS_FLOAT(v_res); + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); + maxf = EXTRACT_FLOAT(v_res); mask = VMFGEVF_FLOAT(v_max, maxf, gvl); - max_index = VMFIRSTM(mask,gvl); -#ifdef DOUBLE - max_index = *((BLASLONG *)&v_max_index+max_index); -#else - max_index = *((unsigned int *)&v_max_index+max_index); -#endif + UINT_V_T compressed; + compressed = VCOMPRESS(v_max_index, mask, gvl); + max_index = VMV_X(compressed); + if(j < n){ gvl = VSETVL(n-j); - vx = VLEV_FLOAT(&x[j], gvl); - //fabs(vector) - mask = VMFLTVF_FLOAT(vx, 0, gvl); - v_max = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); + v_max = VLEV_FLOAT(&x[j], gvl); + v_max = VFABS_FLOAT(v_max, gvl); - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); - FLOAT cur_maxf = VFMVFS_FLOAT(v_res); + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); + FLOAT cur_maxf = EXTRACT_FLOAT(v_res); if(cur_maxf > maxf){ //tail index - v_max_index = VIDV_UINT(gvl); - v_max_index = VADDVX_UINT(v_max_index, j, gvl); + v_max_index = VADDVX_UINT(vid, j, gvl); mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); - max_index = VMFIRSTM(mask,gvl); -#ifdef DOUBLE - max_index = *((BLASLONG*)&v_max_index+max_index); -#else - max_index = *((unsigned int*)&v_max_index+max_index); -#endif + UINT_V_T compressed; + compressed = VCOMPRESS(v_max_index, mask, gvl); + max_index = VMV_X(compressed); } } }else{ @@ -155,56 +151,48 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) unsigned int stride_x = inc_x * sizeof(FLOAT); unsigned int idx = 0, inc_v = gvl * inc_x; + v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); v_max_index = VMVVX_UINT(0, gvl); - v_max = VFMVVF_FLOAT(-1, 
gvl); for(i=0,j=0; i < n/gvl; i++){ vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); - //fabs(vector) - mask = VMFLTVF_FLOAT(vx, 0, gvl); - vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); + vx = VFABS_FLOAT(vx, gvl); //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, gvl); - v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, vid, j, gvl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx, gvl); j += gvl; idx += inc_v; } - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); - maxf = VFMVFS_FLOAT(v_res); + + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); + maxf = EXTRACT_FLOAT(v_res); mask = VMFGEVF_FLOAT(v_max, maxf, gvl); - max_index = VMFIRSTM(mask,gvl); -#ifdef DOUBLE - max_index = *((BLASLONG*)&v_max_index+max_index); -#else - max_index = *((unsigned int*)&v_max_index+max_index); -#endif + UINT_V_T compressed; + compressed = VCOMPRESS(v_max_index, mask, gvl); + max_index = VMV_X(compressed); + if(j < n){ gvl = VSETVL(n-j); - vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); - //fabs(vector) - mask = VMFLTVF_FLOAT(vx, 0, gvl); - v_max = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); + v_max = VLSEV_FLOAT(&x[idx], stride_x, gvl); + v_max = VFABS_FLOAT(v_max, gvl); + + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); + FLOAT cur_maxf = EXTRACT_FLOAT(v_res); - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); - FLOAT cur_maxf = VFMVFS_FLOAT(v_res); if(cur_maxf > maxf){ //tail index - v_max_index = VIDV_UINT(gvl); - v_max_index = VADDVX_UINT(v_max_index, j, gvl); + v_max_index = VADDVX_UINT(vid, j, gvl); mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); - max_index = VMFIRSTM(mask,gvl); -#ifdef DOUBLE - max_index = *((BLASLONG*)&v_max_index+max_index); -#else - max_index = *((unsigned int*)&v_max_index+max_index); -#endif + + UINT_V_T compressed; + compressed = VCOMPRESS(v_max_index, mask, gvl); + max_index = VMV_X(compressed); } } } - return(max_index+1); + return(max_index+1); } diff --git a/kernel/riscv64/iamin_rvv.c b/kernel/riscv64/iamin_rvv.c new file mode 100644 index 000000000..f90dbb545 --- /dev/null +++ b/kernel/riscv64/iamin_rvv.c @@ -0,0 +1,150 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f64m8_b8 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m8_b8 +#define VMFLEVF_FLOAT __riscv_vmfle_vf_f64m8_b8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFABSV_FLOAT __riscv_vfabs_v_f64m8 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f64m8_tu +#define VFIRSTM __riscv_vfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT_TU __riscv_vid_v_u64m8_tumu +#define VIDV_UINT __riscv_vid_v_u64m8 +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u64m8_tumu +#define VADDVX_UINT __riscv_vadd_vx_u64m8 +#define VMVVX_UINT __riscv_vmv_v_x_u64m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u64m8 +#define VMVVXS_UINT __riscv_vmv_x_s_u64m8_u64 +#else +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f32m8_b4 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m8_b4 +#define VMFLEVF_FLOAT __riscv_vmfle_vf_f32m8_b4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFABSV_FLOAT __riscv_vfabs_v_f32m8 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f32m8_tu +#define VFIRSTM __riscv_vfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT_TU __riscv_vid_v_u32m8_tumu +#define VIDV_UINT __riscv_vid_v_u32m8 +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u32m8_tumu +#define VADDVX_UINT __riscv_vadd_vx_u32m8 +#define VMVVX_UINT __riscv_vmv_v_x_u32m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u32m8 +#define VMVVXS_UINT __riscv_vmv_x_s_u32m8_u32 +#endif + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + unsigned int min_index = 0; + if (n <= 0 || inc_x <= 0) return(min_index); + + FLOAT_V_T vx, v_min; + UINT_V_T v_min_index; + MASK_T mask; + + size_t vlmax = VSETVL_MAX; + v_min_index = VMVVX_UINT(0, vlmax); + v_min = VFMVVF_FLOAT(FLT_MAX, vlmax); + BLASLONG j=0; + FLOAT minf=0.0; + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl, j += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vx = VFABSV_FLOAT(vx, vl); + + // index where element less than v_min + mask = VMFLTVV_FLOAT(vx, v_min, vl); + v_min_index = VIDV_MASK_UINT_TU(mask, v_min_index, vl); + v_min_index = 
VADDVX_MASK_UINT_TU(mask, v_min_index, v_min_index, j, vl); + + //update v_min and start_index j + v_min = VFMINVV_FLOAT_TU(v_min, v_min, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, j += vl) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vx = VFABSV_FLOAT(vx, vl); + + // index where element less than v_min + mask = VMFLTVV_FLOAT(vx, v_min, vl); + v_min_index = VIDV_MASK_UINT_TU(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT_TU(mask, v_min_index, v_min_index, j, vl); + + //update v_min and start_index j + v_min = VFMINVV_FLOAT_TU(v_min, v_min, vx, vl); + } + + } + + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(FLT_MAX, vlmax); + + v_res = VFREDMINVS_FLOAT(v_min, v_res, vlmax); + minf = VFMVFS_FLOAT_M1(v_res); + mask = VMFLEVF_FLOAT(v_min, minf, vlmax); + min_index = VFIRSTM(mask, vlmax); + + v_min_index = VSLIDEDOWN_UINT(v_min_index, min_index, vlmax); + min_index = VMVVXS_UINT(v_min_index); + + return(min_index+1); +} diff --git a/kernel/riscv64/iamin_vector.c b/kernel/riscv64/iamin_vector.c index 4e81e7848..0e591e697 100644 --- a/kernel/riscv64/iamin_vector.c +++ b/kernel/riscv64/iamin_vector.c @@ -31,85 +31,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(DOUBLE) -#define ABS fabs -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e64m8)(n) #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 -#define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m -#define VFMINVV_FLOAT vfmin_vv_f64m8 -#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 -#define VMFIRSTM vmfirst_m_b8 -#define UINT_V_T vuint64m8_t +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m8) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m8) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMINVS_FLOAT(va, vb, gvl) vfredmin_vs_f64m8_f64m1(v_res, va, vb, gvl) #define VIDV_MASK_UINT vid_v_u64m8_m -#define VIDV_UINT vid_v_u64m8 #define VADDVX_MASK_UINT vadd_vx_u64m8_m -#define VADDVX_UINT vadd_vx_u64m8 -#define VMVVX_UINT vmv_v_x_u64m8 +#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u64m8)(vm, compressed, va, gvl) +#else +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1 +#define VIDV_MASK_UINT __riscv_vid_v_u64m8_mu +#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_mu +#define VCOMPRESS RISCV_RVV(vcompress_vm_u64m8) +#endif +#define MASK_T vbool8_t +#define VMFGTVV_FLOAT RISCV_RVV(vmfgt_vv_f64m8_b8) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m8) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFMINVV_FLOAT RISCV_RVV(vfmin_vv_f64m8) +#define VMFLEVF_FLOAT RISCV_RVV(vmfle_vf_f64m8_b8) +#define VMFIRSTM RISCV_RVV(vfirst_m_b8) +#define UINT_V_T vuint64m8_t +#define VIDV_UINT RISCV_RVV(vid_v_u64m8) +#define VADDVX_UINT RISCV_RVV(vadd_vx_u64m8) +#define VMVVX_UINT RISCV_RVV(vmv_v_x_u64m8) +#define VFABS_FLOAT RISCV_RVV(vfabs_v_f64m8) +#define VMV_X RISCV_RVV(vmv_x_s_u64m8_u64) #else -#define ABS fabsf -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m8)(n) #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT 
vlse32_v_f32m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 -#define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m -#define VFMINVV_FLOAT vfmin_vv_f32m8 -#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 -#define VMFIRSTM vmfirst_m_b4 -#define UINT_V_T vuint32m8_t +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMINVS_FLOAT(va, vb, gvl) vfredmin_vs_f32m8_f32m1(v_res, va, vb, gvl) #define VIDV_MASK_UINT vid_v_u32m8_m -#define VIDV_UINT vid_v_u32m8 #define VADDVX_MASK_UINT vadd_vx_u32m8_m -#define VADDVX_UINT vadd_vx_u32m8 -#define VMVVX_UINT vmv_v_x_u32m8 +#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u32m8)(vm, compressed, va, gvl) +#else +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1 +#define VIDV_MASK_UINT __riscv_vid_v_u32m8_mu +#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_mu +#define VCOMPRESS RISCV_RVV(vcompress_vm_u32m8) +#endif +#define MASK_T vbool4_t +#define VMFGTVV_FLOAT RISCV_RVV(vmfgt_vv_f32m8_b4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m8) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFMINVV_FLOAT RISCV_RVV(vfmin_vv_f32m8) +#define VMFLEVF_FLOAT RISCV_RVV(vmfle_vf_f32m8_b4) +#define VMFIRSTM RISCV_RVV(vfirst_m_b4) +#define UINT_V_T vuint32m8_t +#define VIDV_UINT RISCV_RVV(vid_v_u32m8) +#define VADDVX_UINT RISCV_RVV(vadd_vx_u32m8) +#define VMVVX_UINT RISCV_RVV(vmv_v_x_u32m8) +#define VFABS_FLOAT RISCV_RVV(vfabs_v_f32m8) +#define VMV_X RISCV_RVV(vmv_x_s_u32m8_u32) #endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i=0, j=0; - FLOAT minf=FLT_MAX; + BLASLONG i=0, j=0; unsigned int min_index = 0; - if (n <= 0 || inc_x <= 0) return(min_index); + if (n <= 0 || inc_x <= 0) return(min_index); + FLOAT minf=FLT_MAX; FLOAT_V_T vx, v_min; UINT_V_T v_min_index; MASK_T mask; unsigned int gvl = 0; - FLOAT_V_T_M1 v_res, v_max; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1); if(inc_x == 1){ gvl = VSETVL(n); - v_min = VFMVVF_FLOAT(FLT_MAX, gvl); v_min_index = VMVVX_UINT(0, gvl); + v_min = VFMVVF_FLOAT(FLT_MAX, gvl); for(i=0,j=0; i < n/gvl; i++){ vx = VLEV_FLOAT(&x[j], gvl); - //fabs(vector) - mask = VMFLTVF_FLOAT(vx, 0, gvl); - vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); + vx = VFABS_FLOAT(vx, gvl); - //index where element less than v_min - mask = VMFLTVV_FLOAT(vx, v_min, gvl); + //index where element greater than v_min + mask = VMFGTVV_FLOAT(v_min, vx, gvl); v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, gvl); @@ -117,29 +125,29 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v_min = VFMINVV_FLOAT(v_min, vx, gvl); j += gvl; } - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - minf = *((FLOAT*)&v_res); + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); + minf = EXTRACT_FLOAT(v_res); mask = VMFLEVF_FLOAT(v_min, minf, gvl); - min_index = VMFIRSTM(mask,gvl); - min_index = *((unsigned int*)&v_min_index+min_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_min_index, mask, gvl); + min_index = VMV_X(compressed); if(j < n){ gvl = VSETVL(n-j); - vx = VLEV_FLOAT(&x[j], gvl); - //fabs(vector) - mask = VMFLTVF_FLOAT(vx, 0, gvl); - v_min = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); + v_min = 
VLEV_FLOAT(&x[j], gvl); + v_min = VFABS_FLOAT(v_min, gvl); - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - FLOAT cur_minf = *((FLOAT*)&v_res); + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); + FLOAT cur_minf = EXTRACT_FLOAT(v_res); if(cur_minf < minf){ //tail index v_min_index = VIDV_UINT(gvl); v_min_index = VADDVX_UINT(v_min_index, j, gvl); mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); - min_index = VMFIRSTM(mask,gvl); - min_index = *((unsigned int*)&v_min_index+min_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_min_index, mask, gvl); + min_index = VMV_X(compressed); } } }else{ @@ -151,12 +159,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v_min_index = VMVVX_UINT(0, gvl); for(i=0,j=0; i < n/gvl; i++){ vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); - //fabs(vector) - mask = VMFLTVF_FLOAT(vx, 0, gvl); - vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); + vx = VFABS_FLOAT(vx, gvl); - //index where element less than v_min - mask = VMFLTVV_FLOAT(vx, v_min, gvl); + //index where element greater than v_min + mask = VMFGTVV_FLOAT(v_min, vx, gvl); v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, gvl); @@ -165,33 +171,31 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) j += gvl; idx += inc_v; } - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - minf = *((FLOAT*)&v_res); + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); + minf = EXTRACT_FLOAT(v_res); mask = VMFLEVF_FLOAT(v_min, minf, gvl); - min_index = VMFIRSTM(mask,gvl); - min_index = *((unsigned int*)&v_min_index+min_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_min_index, mask, gvl); + min_index = VMV_X(compressed); if(j < n){ gvl = VSETVL(n-j); - vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); - //fabs(vector) - mask = VMFLTVF_FLOAT(vx, 0, gvl); - v_min = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); + v_min = VLSEV_FLOAT(&x[idx], stride_x, gvl); + v_min = VFABS_FLOAT(v_min, gvl); - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - FLOAT cur_minf = *((FLOAT*)&v_res); + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); + FLOAT cur_minf = EXTRACT_FLOAT(v_res); if(cur_minf < minf){ //tail index v_min_index = VIDV_UINT(gvl); v_min_index = VADDVX_UINT(v_min_index, j, gvl); mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); - min_index = VMFIRSTM(mask,gvl); - min_index = *((unsigned int*)&v_min_index+min_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_min_index, mask, gvl); + min_index = VMV_X(compressed); } } } - return(min_index+1); + return(min_index+1); } - - diff --git a/kernel/riscv64/imax_rvv.c b/kernel/riscv64/imax_rvv.c new file mode 100644 index 000000000..b1a77b178 --- /dev/null +++ b/kernel/riscv64/imax_rvv.c @@ -0,0 +1,146 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f64m8_b8 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m8_b8 +#define VMFGEVF_FLOAT __riscv_vmfge_vf_f64m8_b8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f64m8_tu +#define VFIRSTM __riscv_vfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT_TU __riscv_vid_v_u64m8_tumu +#define VIDV_UINT __riscv_vid_v_u64m8 +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u64m8_tumu +#define VADDVX_UINT __riscv_vadd_vx_u64m8 +#define VMVVX_UINT __riscv_vmv_v_x_u64m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u64m8 +#define VMVVXS_UINT __riscv_vmv_x_s_u64m8_u64 +#else +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f32m8_b4 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m8_b4 +#define VMFGEVF_FLOAT __riscv_vmfge_vf_f32m8_b4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f32m8_tu +#define VFIRSTM __riscv_vfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT_TU __riscv_vid_v_u32m8_tumu +#define VIDV_UINT __riscv_vid_v_u32m8 +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u32m8_tumu +#define VADDVX_UINT __riscv_vadd_vx_u32m8 +#define VMVVX_UINT __riscv_vmv_v_x_u32m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u32m8 +#define VMVVXS_UINT __riscv_vmv_x_s_u32m8_u32 +#endif + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + unsigned int max_index = 0; + if (n <= 0 || inc_x <= 0) return(max_index); + + FLOAT_V_T vx, v_max; + UINT_V_T v_max_index; + MASK_T mask; + + size_t vlmax = VSETVL_MAX; + v_max_index = VMVVX_UINT(0, vlmax); + v_max = VFMVVF_FLOAT(-FLT_MAX, vlmax); + BLASLONG j=0; + FLOAT maxf=0.0; + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl, j += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + + //index where element greater 
than v_max + mask = VMFLTVV_FLOAT(v_max, vx, vl); + v_max_index = VIDV_MASK_UINT_TU(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT_TU(mask, v_max_index, v_max_index, j, vl); + + //update v_max and start_index j + v_max = VFMAXVV_FLOAT_TU(v_max, v_max, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, j += vl) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + + //index where element greater than v_max + mask = VMFLTVV_FLOAT(v_max, vx, vl); + v_max_index = VIDV_MASK_UINT_TU(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT_TU(mask, v_max_index, v_max_index, j, vl); + + //update v_max and start_index j + v_max = VFMAXVV_FLOAT_TU(v_max, v_max, vx, vl); + } + + } + + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(-FLT_MAX, vlmax); + + v_res = VFREDMAXVS_FLOAT(v_max, v_res, vlmax); + maxf = VFMVFS_FLOAT_M1(v_res); + mask = VMFGEVF_FLOAT(v_max, maxf, vlmax); + max_index = VFIRSTM(mask, vlmax); + + v_max_index = VSLIDEDOWN_UINT(v_max_index, max_index, vlmax); + max_index = VMVVXS_UINT(v_max_index); + + return(max_index+1); +} diff --git a/kernel/riscv64/imax_vector.c b/kernel/riscv64/imax_vector.c index ca48a3c48..308fa15a4 100644 --- a/kernel/riscv64/imax_vector.c +++ b/kernel/riscv64/imax_vector.c @@ -31,68 +31,80 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(DOUBLE) -#define ABS fabs -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e64m8)(n) #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m8) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m8) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMAXVS_FLOAT(va, vb, gvl) vfredmax_vs_f64m8_f64m1(v_res, va, vb, gvl) +#define VIDV_MASK_UINT RISCV_RVV(vid_v_u64m8_m) +#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u64m8_m) +#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u64m8)(vm, compressed, va, gvl) +#else +#define VFREDMAXVS_FLOAT RISCV_RVV(vfredmax_vs_f64m8_f64m1) +#define VIDV_MASK_UINT RISCV_RVV(vid_v_u64m8_mu) +#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u64m8_mu) +#define VCOMPRESS RISCV_RVV(vcompress_vm_u64m8) +#endif #define MASK_T vbool8_t -#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMAXVV_FLOAT vfmax_vv_f64m8 -#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8 -#define VMFIRSTM vmfirst_m_b8 +#define VMFLTVV_FLOAT RISCV_RVV(vmflt_vv_f64m8_b8) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m8) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFMAXVV_FLOAT RISCV_RVV(vfmax_vv_f64m8) +#define VMFGEVF_FLOAT RISCV_RVV(vmfge_vf_f64m8_b8) +#define VMFIRSTM RISCV_RVV(vfirst_m_b8) #define UINT_V_T vuint64m8_t -#define VIDV_MASK_UINT vid_v_u64m8_m -#define VIDV_UINT vid_v_u64m8 -#define VADDVX_MASK_UINT vadd_vx_u64m8_m -#define VADDVX_UINT vadd_vx_u64m8 -#define VMVVX_UINT vmv_v_x_u64m8 +#define VIDV_UINT RISCV_RVV(vid_v_u64m8) +#define VADDVX_UINT RISCV_RVV(vadd_vx_u64m8) +#define VMVVX_UINT RISCV_RVV(vmv_v_x_u64m8) +#define VMV_X RISCV_RVV(vmv_x_s_u64m8_u64) #else -#define ABS fabsf -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m8)(n) #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define 
VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMAXVS_FLOAT(va, vb, gvl) vfredmax_vs_f32m8_f32m1(v_res, va, vb, gvl) +#define VIDV_MASK_UINT RISCV_RVV(vid_v_u32m8_m) +#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u32m8_m) +#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u32m8)(vm, compressed, va, gvl) +#else +#define VFREDMAXVS_FLOAT RISCV_RVV(vfredmax_vs_f32m8_f32m1) +#define VIDV_MASK_UINT RISCV_RVV(vid_v_u32m8_mu) +#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u32m8_mu) +#define VCOMPRESS RISCV_RVV(vcompress_vm_u32m8) +#endif #define MASK_T vbool4_t -#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMAXVV_FLOAT vfmax_vv_f32m8 -#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4 -#define VMFIRSTM vmfirst_m_b4 +#define VMFLTVV_FLOAT RISCV_RVV(vmflt_vv_f32m8_b4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m8) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFMAXVV_FLOAT RISCV_RVV(vfmax_vv_f32m8) +#define VMFGEVF_FLOAT RISCV_RVV(vmfge_vf_f32m8_b4) +#define VMFIRSTM RISCV_RVV(vfirst_m_b4) #define UINT_V_T vuint32m8_t -#define VIDV_MASK_UINT vid_v_u32m8_m -#define VIDV_UINT vid_v_u32m8 -#define VADDVX_MASK_UINT vadd_vx_u32m8_m -#define VADDVX_UINT vadd_vx_u32m8 -#define VMVVX_UINT vmv_v_x_u32m8 +#define VIDV_UINT RISCV_RVV(vid_v_u32m8) +#define VADDVX_UINT RISCV_RVV(vadd_vx_u32m8) +#define VMVVX_UINT RISCV_RVV(vmv_v_x_u32m8) +#define VMV_X RISCV_RVV(vmv_x_s_u32m8_u32) #endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i=0, j=0; + BLASLONG i=0, j=0; unsigned int max_index = 0; - if (n <= 0 || inc_x <= 0) return(max_index); - FLOAT maxf=-FLT_MAX; + if (n <= 0 || inc_x <= 0) return(max_index); + FLOAT maxf=-FLT_MAX; FLOAT_V_T vx, v_max; UINT_V_T v_max_index; MASK_T mask; unsigned int gvl = 0; - FLOAT_V_T_M1 v_res, v_min; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_min = VFMVVF_FLOAT_M1(-FLT_MAX, gvl); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(-FLT_MAX, 1); if(inc_x == 1){ gvl = VSETVL(n); @@ -104,32 +116,34 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, gvl); v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j,gvl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx, gvl); j += gvl; } - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); - maxf = *((FLOAT*)&v_res); + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); + maxf = EXTRACT_FLOAT(v_res); mask = VMFGEVF_FLOAT(v_max, maxf, gvl); - max_index = VMFIRSTM(mask,gvl); - max_index = *((unsigned int*)&v_max_index+max_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_max_index, mask, gvl); + max_index = VMV_X(compressed); if(j < n){ gvl = VSETVL(n-j); v_max = VLEV_FLOAT(&x[j], gvl); - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); - FLOAT cur_maxf = *((FLOAT*)&v_res); + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); + FLOAT cur_maxf = EXTRACT_FLOAT(v_res); if(cur_maxf > maxf){ //tail index v_max_index = VIDV_UINT(gvl); v_max_index = VADDVX_UINT(v_max_index, j, gvl); mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); - max_index = VMFIRSTM(mask,gvl); - max_index = *((unsigned int*)&v_max_index+max_index); + UINT_V_T compressed; + 
compressed = VCOMPRESS(v_max_index, mask, gvl); + max_index = VMV_X(compressed); } } }else{ @@ -145,37 +159,37 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, gvl); v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j,gvl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx, gvl); j += gvl; idx += inc_v; } - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); - maxf = *((FLOAT*)&v_res); + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); + maxf = EXTRACT_FLOAT(v_res); mask = VMFGEVF_FLOAT(v_max, maxf, gvl); - max_index = VMFIRSTM(mask,gvl); - max_index = *((unsigned int*)&v_max_index+max_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_max_index, mask, gvl); + max_index = VMV_X(compressed); if(j < n){ gvl = VSETVL(n-j); v_max = VLSEV_FLOAT(&x[idx], stride_x, gvl); - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); - FLOAT cur_maxf = *((FLOAT*)&v_res); + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); + FLOAT cur_maxf = EXTRACT_FLOAT(v_res); if(cur_maxf > maxf){ //tail index v_max_index = VIDV_UINT(gvl); v_max_index = VADDVX_UINT(v_max_index, j, gvl); mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); - max_index = VMFIRSTM(mask,gvl); - max_index = *((unsigned int*)&v_max_index+max_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_max_index, mask, gvl); + max_index = VMV_X(compressed); } } } - return(max_index+1); + return(max_index+1); } - - diff --git a/kernel/riscv64/imin_rvv.c b/kernel/riscv64/imin_rvv.c new file mode 100644 index 000000000..1de7f3233 --- /dev/null +++ b/kernel/riscv64/imin_rvv.c @@ -0,0 +1,146 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f64m8_b8 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m8_b8 +#define VMFLEVF_FLOAT __riscv_vmfle_vf_f64m8_b8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f64m8_tu +#define VFIRSTM __riscv_vfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT_TU __riscv_vid_v_u64m8_tumu +#define VIDV_UINT __riscv_vid_v_u64m8 +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u64m8_tumu +#define VADDVX_UINT __riscv_vadd_vx_u64m8 +#define VMVVX_UINT __riscv_vmv_v_x_u64m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u64m8 +#define VMVVXS_UINT __riscv_vmv_x_s_u64m8_u64 +#else +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f32m8_b4 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m8_b4 +#define VMFLEVF_FLOAT __riscv_vmfle_vf_f32m8_b4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f32m8_tu +#define VFIRSTM __riscv_vfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT_TU __riscv_vid_v_u32m8_tumu +#define VIDV_UINT __riscv_vid_v_u32m8 +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u32m8_tumu +#define VADDVX_UINT __riscv_vadd_vx_u32m8 +#define VMVVX_UINT __riscv_vmv_v_x_u32m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u32m8 +#define VMVVXS_UINT __riscv_vmv_x_s_u32m8_u32 +#endif + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + unsigned int min_index = 0; + if (n <= 0 || inc_x <= 0) return(min_index); + + FLOAT_V_T vx, v_min; + UINT_V_T v_min_index; + MASK_T mask; + + size_t vlmax = VSETVL_MAX; + v_min_index = VMVVX_UINT(0, vlmax); + v_min = VFMVVF_FLOAT(FLT_MAX, vlmax); + BLASLONG j=0; + FLOAT minf=0.0; + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl, j += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + + // index where element less than v_min + mask = VMFLTVV_FLOAT(vx, v_min, vl); + v_min_index = VIDV_MASK_UINT_TU(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT_TU(mask, v_min_index, v_min_index, j, vl); + + //update v_min and start_index j + v_min = VFMINVV_FLOAT_TU(v_min, v_min, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, j += vl) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + + // index where element less than v_min + mask = VMFLTVV_FLOAT(vx, v_min, vl); + v_min_index = VIDV_MASK_UINT_TU(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT_TU(mask, v_min_index, v_min_index, j, vl); + + //update v_min and start_index j + v_min = VFMINVV_FLOAT_TU(v_min, v_min, vx, vl); + } + + } + 
+ FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(FLT_MAX, vlmax); + + v_res = VFREDMINVS_FLOAT(v_min, v_res, vlmax); + minf = VFMVFS_FLOAT_M1(v_res); + mask = VMFLEVF_FLOAT(v_min, minf, vlmax); + min_index = VFIRSTM(mask, vlmax); + + v_min_index = VSLIDEDOWN_UINT(v_min_index, min_index, vlmax); + min_index = VMVVXS_UINT(v_min_index); + + return(min_index+1); +} diff --git a/kernel/riscv64/imin_vector.c b/kernel/riscv64/imin_vector.c index 2a677098d..ec36b8eb9 100644 --- a/kernel/riscv64/imin_vector.c +++ b/kernel/riscv64/imin_vector.c @@ -31,122 +31,119 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(DOUBLE) -#define ABS fabs -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e64m8)(n) #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m8) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m8) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMINVS_FLOAT(va, vb, gvl) vfredmin_vs_f64m8_f64m1(v_res, va, vb, gvl) +#define VIDV_MASK_UINT(mask, gvl) RISCV_RVV(vid_v_u64m8_m)(mask, v_min_index, gvl) +#define VADDVX_MASK_UINT(mask, a, b, gvl) RISCV_RVV(vadd_vx_u64m8_m)(mask, a, a, b, gvl) +#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u64m8)(vm, compressed, va, gvl) +#else +#define VFREDMINVS_FLOAT RISCV_RVV(vfredmin_vs_f64m8_f64m1) +#define VIDV_MASK_UINT RISCV_RVV(vid_v_u64m8_m) +#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u64m8_m) +#define VCOMPRESS RISCV_RVV(vcompress_vm_u64m8) +#endif #define MASK_T vbool8_t -#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMINVV_FLOAT vfmin_vv_f64m8 -#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 -#define VMFIRSTM vmfirst_m_b8 +#define VMFGTVV_FLOAT RISCV_RVV(vmfgt_vv_f64m8_b8) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m8) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFMINVV_FLOAT RISCV_RVV(vfmin_vv_f64m8) +#define VMFLEVF_FLOAT RISCV_RVV(vmfle_vf_f64m8_b8) +#define VMFIRSTM RISCV_RVV(vfirst_m_b8) #define UINT_V_T vuint64m8_t -#define VIDV_MASK_UINT vid_v_u64m8_m -#define VIDV_UINT vid_v_u64m8 -#define VADDVX_MASK_UINT vadd_vx_u64m8_m -#define VADDVX_UINT vadd_vx_u64m8 -#define VMVVX_UINT vmv_v_x_u64m8 +#define VIDV_UINT RISCV_RVV(vid_v_u64m8) +#define VADDVX_UINT RISCV_RVV(vadd_vx_u64m8) +#define VMVVX_UINT RISCV_RVV(vmv_v_x_u64m8) +#define VMV_X RISCV_RVV(vmv_x_s_u64m8_u64) #else -#define ABS fabsf -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m8)(n) #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMINVS_FLOAT(va, vb, gvl) vfredmin_vs_f32m8_f32m1(v_res, va, vb, gvl) +#define VIDV_MASK_UINT(mask, gvl) RISCV_RVV(vid_v_u32m8_m)(mask, v_min_index, gvl) +#define VADDVX_MASK_UINT(mask, a, b, gvl) RISCV_RVV(vadd_vx_u32m8_m)(mask, a, a, b, gvl) +#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u32m8)(vm, compressed, va, gvl) +#else +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1 +#define VIDV_MASK_UINT __riscv_vid_v_u32m8_m +#define VADDVX_MASK_UINT 
__riscv_vadd_vx_u32m8_m +#define VCOMPRESS RISCV_RVV(vcompress_vm_u32m8) +#endif #define MASK_T vbool4_t -#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMINVV_FLOAT vfmin_vv_f32m8 -#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 -#define VMFIRSTM vmfirst_m_b4 +#define VMFGTVV_FLOAT RISCV_RVV(vmfgt_vv_f32m8_b4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m8) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFMINVV_FLOAT RISCV_RVV(vfmin_vv_f32m8) +#define VMFLEVF_FLOAT RISCV_RVV(vmfle_vf_f32m8_b4) +#define VMFIRSTM RISCV_RVV(vfirst_m_b4) #define UINT_V_T vuint32m8_t -#define VIDV_MASK_UINT vid_v_u32m8_m -#define VIDV_UINT vid_v_u32m8 -#define VADDVX_MASK_UINT vadd_vx_u32m8_m -#define VADDVX_UINT vadd_vx_u32m8 -#define VMVVX_UINT vmv_v_x_u32m8 +#define VIDV_UINT RISCV_RVV(vid_v_u32m8) +#define VADDVX_UINT RISCV_RVV(vadd_vx_u32m8) +#define VMVVX_UINT RISCV_RVV(vmv_v_x_u32m8) +#define VMV_X RISCV_RVV(vmv_x_s_u32m8_u32) #endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i=0, j=0; - FLOAT minf=FLT_MAX; + BLASLONG i=0, j=0; unsigned int min_index = 0; - if (n <= 0 || inc_x <= 0) return(min_index); + if (n <= 0 || inc_x <= 0) return(min_index); + FLOAT minf=FLT_MAX; FLOAT_V_T vx, v_min; UINT_V_T v_min_index; MASK_T mask; unsigned int gvl = 0; - FLOAT_V_T_M1 v_res, v_max; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1); if(inc_x == 1){ gvl = VSETVL(n); - v_min = VFMVVF_FLOAT(FLT_MAX, gvl); v_min_index = VMVVX_UINT(0, gvl); + v_min = VFMVVF_FLOAT(FLT_MAX, gvl); for(i=0,j=0; i < n/gvl; i++){ vx = VLEV_FLOAT(&x[j], gvl); - //index where element less than v_min - mask = VMFLTVV_FLOAT(vx, v_min, gvl); - v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1 \n\t" - "vsetvli x0, %2, e64,m8 \n\t" - "vid.v %0, v0.t \n\t" - :"+v"(v_min_index) - :"v"(mask), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1 \n\t" - "vsetvli x0, %2, e32,m8 \n\t" - "vid.v %0, v0.t \n\t" - :"+v"(v_min_index) - :"v"(mask), "r"(gvl) - :"v0"); -#endif -*/ - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j,gvl); + + //index where element greater than v_min + mask = VMFGTVV_FLOAT(v_min, vx, gvl); + v_min_index = VIDV_MASK_UINT(mask, gvl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, gvl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx, gvl); j += gvl; } - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - minf = *((FLOAT*)&v_res); + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); + minf = EXTRACT_FLOAT(v_res); mask = VMFLEVF_FLOAT(v_min, minf, gvl); - min_index = VMFIRSTM(mask,gvl); - min_index = *((unsigned int*)&v_min_index+min_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_min_index, mask, gvl); + min_index = VMV_X(compressed); if(j < n){ gvl = VSETVL(n-j); v_min = VLEV_FLOAT(&x[j], gvl); - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - FLOAT cur_minf = *((FLOAT*)&v_res); - if(cur_minf < minf){ + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); + FLOAT cur_minf = EXTRACT_FLOAT(v_res); + if(cur_minf > minf){ //tail index v_min_index = VIDV_UINT(gvl); v_min_index = VADDVX_UINT(v_min_index, j, gvl); + mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); - min_index = VMFIRSTM(mask,gvl); - min_index = *((unsigned int*)&v_min_index+min_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_min_index, 
mask, gvl); + min_index = VMV_X(compressed); } } }else{ @@ -159,59 +156,39 @@ asm volatile( for(i=0,j=0; i < n/gvl; i++){ vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); - //index where element less than v_min - mask = VMFLTVV_FLOAT(vx, v_min, gvl); - v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1 \n\t" - "vsetvli x0, %2, e64,m8 \n\t" - "vid.v %0, v0.t \n\t" - :"+v"(v_min_index) - :"v"(mask), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1 \n\t" - "vsetvli x0, %2, e32,m8 \n\t" - "vid.v %0, v0.t \n\t" - :"+v"(v_min_index) - :"v"(mask), "r"(gvl) - :"v0"); -#endif -*/ - - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j,gvl); + //index where element greater than v_min + mask = VMFGTVV_FLOAT(v_min, vx, gvl); + v_min_index = VIDV_MASK_UINT(mask, gvl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, gvl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx, gvl); j += gvl; idx += inc_v; } - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - minf = *((FLOAT*)&v_res); + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); + minf = EXTRACT_FLOAT(v_res); mask = VMFLEVF_FLOAT(v_min, minf, gvl); - min_index = VMFIRSTM(mask,gvl); - min_index = *((unsigned int*)&v_min_index+min_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_min_index, mask, gvl); + min_index = VMV_X(compressed); if(j < n){ gvl = VSETVL(n-j); v_min = VLSEV_FLOAT(&x[idx], stride_x, gvl); - - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - FLOAT cur_minf = *((FLOAT*)&v_res); - if(cur_minf < minf){ + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); + FLOAT cur_minf = EXTRACT_FLOAT(v_res); + if(cur_minf > minf){ //tail index v_min_index = VIDV_UINT(gvl); v_min_index = VADDVX_UINT(v_min_index, j, gvl); + mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); - min_index = VMFIRSTM(mask,gvl); - min_index = *((unsigned int*)&v_min_index+min_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_min_index, mask, gvl); + min_index = VMV_X(compressed); } } } - return(min_index+1); + return(min_index+1); } - - diff --git a/kernel/riscv64/izamax_rvv.c b/kernel/riscv64/izamax_rvv.c new file mode 100644 index 000000000..32f66a7a7 --- /dev/null +++ b/kernel/riscv64/izamax_rvv.c @@ -0,0 +1,172 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m4() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VLEV_FLOAT __riscv_vle64_v_f64m4 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m4_f64m1 +#define MASK_T vbool16_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f64m4_b16 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m4_b16 +#define VMFGEVF_FLOAT __riscv_vmfge_vf_f64m4_b16 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFABSV_FLOAT __riscv_vfabs_v_f64m4 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f64m4_tu +#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4 +#define VFIRSTM __riscv_vfirst_m_b16 +#define UINT_V_T vuint64m4_t +#define VIDV_MASK_UINT_TU __riscv_vid_v_u64m4_tumu +#define VIDV_UINT __riscv_vid_v_u64m4 +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u64m4_tumu +#define VADDVX_UINT __riscv_vadd_vx_u64m4 +#define VMVVX_UINT __riscv_vmv_v_x_u64m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u64m4 +#define VMVVXS_UINT __riscv_vmv_x_s_u64m4_u64 +#else +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m4() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VLEV_FLOAT __riscv_vle32_v_f32m4 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m4_f32m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f32m4_b8 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m4_b8 +#define VMFGEVF_FLOAT __riscv_vmfge_vf_f32m4_b8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFABSV_FLOAT __riscv_vfabs_v_f32m4 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f32m4_tu +#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4 +#define VFIRSTM __riscv_vfirst_m_b8 +#define UINT_V_T vuint32m4_t +#define VIDV_MASK_UINT_TU __riscv_vid_v_u32m4_tumu +#define VIDV_UINT __riscv_vid_v_u32m4 +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u32m4_tumu +#define VADDVX_UINT __riscv_vadd_vx_u32m4 +#define VMVVX_UINT __riscv_vmv_v_x_u32m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u32m4 +#define VMVVXS_UINT __riscv_vmv_x_s_u32m4_u32 +#endif + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + unsigned int max_index = 0; + if (n <= 0 || inc_x <= 0) return(max_index); + + FLOAT_V_T vx0, vx1, v_max; + FLOAT_VX2_T vxx2; + UINT_V_T 
v_max_index; + MASK_T mask; + + size_t vlmax = VSETVL_MAX; + v_max_index = VMVVX_UINT(0, vlmax); + v_max = VFMVVF_FLOAT(-1, vlmax); + BLASLONG j=0; + FLOAT maxf=0.0; + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2, j += vl) { + vl = VSETVL(n); + + vxx2 = VLSEG_FLOAT(x, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + + vx0 = VFABSV_FLOAT(vx0, vl); + vx1 = VFABSV_FLOAT(vx1, vl); + + vx0 = VFADDVV_FLOAT(vx0, vx1, vl); + + //index where element greater than v_max + mask = VMFLTVV_FLOAT(v_max, vx0, vl); + v_max_index = VIDV_MASK_UINT_TU(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT_TU(mask, v_max_index, v_max_index, j, vl); + + //update v_max and start_index j + v_max = VFMAXVV_FLOAT_TU(v_max, v_max, vx0, vl); + } + } + else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, j += vl) { + vl = VSETVL(n); + + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + + vx0 = VFABSV_FLOAT(vx0, vl); + vx1 = VFABSV_FLOAT(vx1, vl); + + vx0 = VFADDVV_FLOAT(vx0, vx1, vl); + + //index where element greater than v_max + mask = VMFLTVV_FLOAT(v_max, vx0, vl); + v_max_index = VIDV_MASK_UINT_TU(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT_TU(mask, v_max_index, v_max_index, j, vl); + + //update v_max and start_index j + v_max = VFMAXVV_FLOAT_TU(v_max, v_max, vx0, vl); + } + + } + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(0, vlmax); + + v_res = VFREDMAXVS_FLOAT(v_max, v_res, vlmax); + maxf = VFMVFS_FLOAT_M1(v_res); + mask = VMFGEVF_FLOAT(v_max, maxf, vlmax); + max_index = VFIRSTM(mask, vlmax); + + v_max_index = VSLIDEDOWN_UINT(v_max_index, max_index, vlmax); + max_index = VMVVXS_UINT(v_max_index); + + return(max_index+1); +} diff --git a/kernel/riscv64/izamax_vector.c b/kernel/riscv64/izamax_vector.c index 66a101566..d33e89c00 100644 --- a/kernel/riscv64/izamax_vector.c +++ b/kernel/riscv64/izamax_vector.c @@ -27,241 +27,146 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
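The complex kernels (izamax_rvv.c above and the reworked izamax_vector.c that follows) rank entries by |Re| + |Im|, the usual BLAS convention for ?ZAMAX, which is exactly what the segment-load, vfabs, vfadd sequence computes per lane before the same index-tracking pass. A scalar reference of that quantity, useful for comparison while reading the vector code, is sketched below; izamax_ref is an illustrative name and the function is not part of the patch.

#include <math.h>
#include <stddef.h>

static size_t izamax_ref(size_t n, const double *x, size_t inc_x)
{
    if (n == 0 || inc_x == 0) return 0;

    size_t best = 0;
    double best_val = -1.0;
    for (size_t i = 0; i < n; i++) {
        const double *p = x + 2 * i * inc_x;   /* interleaved re, im */
        double v = fabs(p[0]) + fabs(p[1]);    /* |Re| + |Im|, not the true modulus */
        if (v > best_val) { best_val = v; best = i; }
    }
    return best + 1;                           /* 1-based, as BLAS expects */
}

With inc_x given in complex elements, the real and imaginary parts of element i sit at x[2*i*inc_x] and x[2*i*inc_x + 1], which is also why the strided kernels scale their byte stride by 2*sizeof(FLOAT).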
#include "common.h" #include +#include #if defined(DOUBLE) -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e64m8)(n) #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m8) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m8) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMAXVS_FLOAT(va, vb, gvl) RISCV_RVV(vfredmax_vs_f64m8_f64m1)(v_res, va, vb, gvl) +#define VIDV_MASK_UINT RISCV_RVV(vid_v_u64m8_m) +#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u64m8_m) +#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u64m8)(vm, compressed, va, gvl) +#else +#define VFREDMAXVS_FLOAT RISCV_RVV(vfredmax_vs_f64m8_f64m1) +#define VIDV_MASK_UINT RISCV_RVV(vid_v_u64m8_mu) +#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u64m8_mu) +#define VCOMPRESS RISCV_RVV(vcompress_vm_u64m8) +#endif #define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m -#define VFMAXVV_FLOAT vfmax_vv_f64m8 -#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8 -#define VMFIRSTM vmfirst_m_b8 +#define VMFLTVV_FLOAT RISCV_RVV(vmflt_vv_f64m8_b8) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m8) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFMAXVV_FLOAT RISCV_RVV(vfmax_vv_f64m8) +#define VMFGEVF_FLOAT RISCV_RVV(vmfge_vf_f64m8_b8) +#define VMFIRSTM RISCV_RVV(vfirst_m_b8) #define UINT_V_T vuint64m8_t -#define VSEVU_UINT vse64_v_u64m8 +#define VSEVU_UINT RISCV_RVV(vse64_v_u64m8) #define UINT_T long unsigned int -#define VIDV_MASK_UINT vid_v_u64m8_m -#define VIDV_UINT vid_v_u64m8 -#define VADDVX_MASK_UINT vadd_vx_u64m8_m -#define VADDVX_UINT vadd_vx_u64m8 -#define VFADDVV_FLOAT vfadd_vv_f64m8 -#define VMVVX_UINT vmv_v_x_u64m8 +#define VIDV_UINT RISCV_RVV(vid_v_u64m8) +#define VADDVX_UINT RISCV_RVV(vadd_vx_u64m8) +#define VMVVX_UINT RISCV_RVV(vmv_v_x_u64m8) +#define VFABS_FLOAT RISCV_RVV(vfabs_v_f64m8) +#define VFADDVV_FLOAT RISCV_RVV(vfadd_vv_f64m8) +#define VMV_X RISCV_RVV(vmv_x_s_u64m8_u64) #else -#define ABS fabsf -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m8)(n) #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMAXVS_FLOAT(va, vb, gvl) RISCV_RVV(vfredmax_vs_f32m8_f32m1)(v_res, va, vb, gvl) +#define VIDV_MASK_UINT RISCV_RVV(vid_v_u32m8_m) +#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u32m8_m) +#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u32m8)(vm, compressed, va, gvl) +#else +#define VFREDMAXVS_FLOAT RISCV_RVV(vfredmax_vs_f32m8_f32m1) +#define VIDV_MASK_UINT RISCV_RVV(vid_v_u32m8_mu) +#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u32m8_mu) +#define VCOMPRESS RISCV_RVV(vcompress_vm_u32m8) +#endif #define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m -#define VFMAXVV_FLOAT vfmax_vv_f32m8 -#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4 
-#define VMFIRSTM vmfirst_m_b4 +#define VMFLTVV_FLOAT RISCV_RVV(vmflt_vv_f32m8_b4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m8) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFMAXVV_FLOAT RISCV_RVV(vfmax_vv_f32m8) +#define VMFGEVF_FLOAT RISCV_RVV(vmfge_vf_f32m8_b4) +#define VMFIRSTM RISCV_RVV(vfirst_m_b4) #define UINT_V_T vuint32m8_t #define UINT_T unsigned int -#define VSEVU_UINT vse32_v_u32m8 -#define VIDV_MASK_UINT vid_v_u32m8_m -#define VIDV_UINT vid_v_u32m8 -#define VADDVX_MASK_UINT vadd_vx_u32m8_m -#define VADDVX_UINT vadd_vx_u32m8 -#define VFADDVV_FLOAT vfadd_vv_f32m8 -#define VMVVX_UINT vmv_v_x_u32m8 +#define VSEVU_UINT RISCV_RVV(vse32_v_u32m8) +#define VIDV_UINT RISCV_RVV(vid_v_u32m8) +#define VADDVX_UINT RISCV_RVV(vadd_vx_u32m8) +#define VMVVX_UINT RISCV_RVV(vmv_v_x_u32m8) +#define VFABS_FLOAT RISCV_RVV(vfabs_v_f32m8) +#define VFADDVV_FLOAT RISCV_RVV(vfadd_vv_f32m8) +#define VMV_X RISCV_RVV(vmv_x_s_u32m8_u32) #endif -#define RVV_M RVV_M8 BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i=0, j=0; - FLOAT maxf=0.0; + BLASLONG i=0, j=0; unsigned int max_index = 0; - if (n <= 0 || inc_x <= 0) return(max_index); + if (n <= 0 || inc_x <= 0) return(max_index); + FLOAT maxf=-FLT_MAX; - FLOAT_V_T vx0, vx1, v_max; + FLOAT_V_T vx, vx2, v_max; UINT_V_T v_max_index; - MASK_T mask0, mask1; + MASK_T mask; unsigned int gvl = 0; - FLOAT_V_T_M1 v_res, v_z0; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_z0 = VFMVVF_FLOAT_M1(0, gvl); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(-FLT_MAX, 1); gvl = VSETVL(n); - UINT_T temp_uint[gvl]; + unsigned int stride_x = inc_x * 2 * sizeof(FLOAT); + unsigned int idx = 0, inc_v = gvl * inc_x * 2; + + v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); v_max_index = VMVVX_UINT(0, gvl); - v_max = VFMVVF_FLOAT(-1, gvl); - BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); - BLASLONG inc_xv = gvl * inc_x * 2; - BLASLONG ix = 0; for(i=0,j=0; i < n/gvl; i++){ - vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); - //fabs(vector) - mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); - vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e64,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx0) - :"v"(mask0), "f"(zero), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e32,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx0) - :"v"(mask0), "f"(zero), "r"(gvl) - :"v0"); -#endif -*/ - vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); - //fabs(vector) - mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); - vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e64,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx1) - :"v"(mask1), "f"(zero), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e32,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx1) - :"v"(mask1), "f"(zero), "r"(gvl) - :"v0"); -#endif -*/ - vx0 = VFADDVV_FLOAT(vx0, vx1, gvl); + vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); + vx2 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl); + vx = VFABS_FLOAT(vx, gvl); + vx2 = VFABS_FLOAT(vx2, gvl); + vx = VFADDVV_FLOAT(vx, vx2, gvl); + //index where element greater than v_max - mask0 = VMFLTVV_FLOAT(v_max, vx0, gvl); - v_max_index = VIDV_MASK_UINT(mask0, v_max_index, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1 \n\t" - "vsetvli x0, %2, e64,m8 \n\t" - "vid.v %0, v0.t \n\t" - :"+v"(v_max_index) - :"v"(mask0), "r"(gvl) - :"v0"); 
-#else -asm volatile( - "vor.vv v0, %1, %1 \n\t" - "vsetvli x0, %2, e32,m8 \n\t" - "vid.v %0, v0.t \n\t" - :"+v"(v_max_index) - :"v"(mask0), "r"(gvl) - :"v0"); -#endif -*/ - v_max_index = VADDVX_MASK_UINT(mask0, v_max_index, v_max_index, j, gvl); + mask = VMFLTVV_FLOAT(v_max, vx, gvl); + v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl); //update v_max and start_index j - v_max = VFMAXVV_FLOAT(v_max, vx0, gvl); + v_max = VFMAXVV_FLOAT(v_max, vx, gvl); j += gvl; - ix += inc_xv; + idx += inc_v; } - vx0 = VFMVVF_FLOAT(0, gvl); - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); - maxf = VFMVFS_FLOAT(v_res); - mask0 = VMFGEVF_FLOAT(v_max, maxf, gvl); - max_index = VMFIRSTM(mask0,gvl); - VSEVU_UINT(temp_uint,v_max_index,gvl); - max_index = temp_uint[max_index]; - + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); + maxf = EXTRACT_FLOAT(v_res); + mask = VMFGEVF_FLOAT(v_max, maxf, gvl); + UINT_V_T compressed; + compressed = VCOMPRESS(v_max_index, mask, gvl); + max_index = VMV_X(compressed); if(j < n){ gvl = VSETVL(n-j); - v_max_index = VMVVX_UINT(0, gvl); - vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); - //fabs(vector) - mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); - vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e64,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx0) - :"v"(mask0), "f"(zero), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e32,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx0) - :"v"(mask0), "f"(zero), "r"(gvl) - :"v0"); -#endif -*/ - vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); - //fabs(vector) - mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); - vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e64,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx1) - :"v"(mask1), "f"(zero), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e32,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx1) - :"v"(mask1), "f"(zero), "r"(gvl) - :"v0"); -#endif -*/ - v_max = VFADDVV_FLOAT(vx0, vx1, gvl); - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); - FLOAT cur_maxf = VFMVFS_FLOAT(v_res); + v_max = VLSEV_FLOAT(&x[idx], stride_x, gvl); + vx2 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl); + v_max = VFABS_FLOAT(v_max, gvl); + vx2 = VFABS_FLOAT(vx2, gvl); + v_max = VFADDVV_FLOAT(v_max, vx2, gvl); + + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); + FLOAT cur_maxf = EXTRACT_FLOAT(v_res); + if(cur_maxf > maxf){ //tail index v_max_index = VIDV_UINT(gvl); v_max_index = VADDVX_UINT(v_max_index, j, gvl); - mask0 = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); - max_index = VMFIRSTM(mask0,gvl); - VSEVU_UINT(temp_uint,v_max_index,gvl); - max_index = temp_uint[max_index]; - + mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); + UINT_V_T compressed; + compressed = VCOMPRESS(v_max_index, mask, gvl); + max_index = VMV_X(compressed); } } - return(max_index+1); -} - + return(max_index+1); +} diff --git a/kernel/riscv64/izamin_rvv.c b/kernel/riscv64/izamin_rvv.c new file mode 100644 index 000000000..d34b220fa --- /dev/null +++ b/kernel/riscv64/izamin_rvv.c @@ -0,0 +1,171 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m4() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m4_f64m1 +#define MASK_T vbool16_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f64m4_b16 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m4_b16 +#define VMFLEVF_FLOAT __riscv_vmfle_vf_f64m4_b16 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFABSV_FLOAT __riscv_vfabs_v_f64m4 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f64m4_tu +#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4 +#define VFIRSTM __riscv_vfirst_m_b16 +#define UINT_V_T vuint64m4_t +#define VIDV_MASK_UINT_TU __riscv_vid_v_u64m4_tumu +#define VIDV_UINT __riscv_vid_v_u64m4 +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u64m4_tumu +#define VADDVX_UINT __riscv_vadd_vx_u64m4 +#define VMVVX_UINT __riscv_vmv_v_x_u64m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u64m4 +#define VMVVXS_UINT __riscv_vmv_x_s_u64m4_u64 +#else +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m4() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m4_f32m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f32m4_b8 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m4_b8 +#define VMFLEVF_FLOAT __riscv_vmfle_vf_f32m4_b8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFABSV_FLOAT __riscv_vfabs_v_f32m4 
+#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f32m4_tu +#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4 +#define VFIRSTM __riscv_vfirst_m_b8 +#define UINT_V_T vuint32m4_t +#define VIDV_MASK_UINT_TU __riscv_vid_v_u32m4_tumu +#define VIDV_UINT __riscv_vid_v_u32m4 +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u32m4_tumu +#define VADDVX_UINT __riscv_vadd_vx_u32m4 +#define VMVVX_UINT __riscv_vmv_v_x_u32m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u32m4 +#define VMVVXS_UINT __riscv_vmv_x_s_u32m4_u32 +#endif + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + unsigned int min_index = 0; + if (n <= 0 || inc_x <= 0) return(min_index); + + FLOAT_V_T vx0, vx1, v_min; + FLOAT_VX2_T vxx2; + UINT_V_T v_min_index; + MASK_T mask; + + size_t vlmax = VSETVL_MAX; + v_min_index = VMVVX_UINT(0, vlmax); + v_min = VFMVVF_FLOAT(FLT_MAX, vlmax); + BLASLONG j=0; + FLOAT minf=0.0; + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2, j += vl) { + vl = VSETVL(n); + + vxx2 = VLSEG_FLOAT(x, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + + vx0 = VFABSV_FLOAT(vx0, vl); + vx1 = VFABSV_FLOAT(vx1, vl); + + vx0 = VFADDVV_FLOAT(vx0, vx1, vl); + + // index where element less than v_min + mask = VMFLTVV_FLOAT(vx0, v_min, vl); + v_min_index = VIDV_MASK_UINT_TU(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT_TU(mask, v_min_index, v_min_index, j, vl); + + //update v_min and start_index j + v_min = VFMINVV_FLOAT_TU(v_min, v_min, vx0, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, j += vl) { + vl = VSETVL(n); + + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + + vx0 = VFABSV_FLOAT(vx0, vl); + vx1 = VFABSV_FLOAT(vx1, vl); + + vx0 = VFADDVV_FLOAT(vx0, vx1, vl); + + // index where element less than v_min + mask = VMFLTVV_FLOAT(vx0, v_min, vl); + v_min_index = VIDV_MASK_UINT_TU(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT_TU(mask, v_min_index, v_min_index, j, vl); + + //update v_min and start_index j + v_min = VFMINVV_FLOAT_TU(v_min, v_min, vx0, vl); + } + + } + + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(FLT_MAX, vlmax); + + v_res = VFREDMINVS_FLOAT(v_min, v_res, vlmax); + minf = VFMVFS_FLOAT_M1(v_res); + mask = VMFLEVF_FLOAT(v_min, minf, vlmax); + min_index = VFIRSTM(mask, vlmax); + + v_min_index = VSLIDEDOWN_UINT(v_min_index, min_index, vlmax); + min_index = VMVVXS_UINT(v_min_index); + + return(min_index+1); +} diff --git a/kernel/riscv64/izamin_vector.c b/kernel/riscv64/izamin_vector.c index 818193a9e..c76a38099 100644 --- a/kernel/riscv64/izamin_vector.c +++ b/kernel/riscv64/izamin_vector.c @@ -31,235 +31,142 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
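/*
 * Editorial note, not part of the patch: both izamin kernels touched here keep a vector
 * of running minima (v_min) and a vector of the loop offsets at which those minima were
 * seen (v_min_index).  The masked vid/vadd pair rewrites an index lane only when that
 * lane's new |re| + |im| is strictly smaller than its running minimum; after the loop a
 * vfredmin reduction finds the global minimum and vcompress (or vfirst/vslidedown in the
 * _rvv variant) pulls out the index stored in the first lane that attains it.  Per lane
 * the update is equivalent to this scalar sketch (all names below are illustrative only):
 */
void izamin_update_lane(double sum, long lane, long j,
                        double *min_val, long *min_idx)
{
    /* mask = (sum < min_val): only masked lanes get a new index from vid.v + vadd.vx */
    if (sum < *min_val) {
        *min_idx = j + lane;   /* vid supplies the lane number, vadd.vx adds the offset j */
        *min_val = sum;        /* vfmin then lowers this lane's running minimum           */
    }
}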
#if defined(DOUBLE) -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e64m8)(n) #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m8) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m8) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMINVS_FLOAT(va, vb, gvl) RISCV_RVV(vfredmin_vs_f64m8_f64m1)(v_res, va, vb, gvl) +#define VIDV_MASK_UINT RISCV_RVV(vid_v_u64m8_m) +#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u64m8_m) +#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u64m8)(vm, compressed, va, gvl) +#else +#define VFREDMINVS_FLOAT RISCV_RVV(vfredmin_vs_f64m8_f64m1) +#define VIDV_MASK_UINT RISCV_RVV(vid_v_u64m8_mu) +#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u64m8_mu) +#define VCOMPRESS RISCV_RVV(vcompress_vm_u64m8) +#endif #define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m -#define VFMINVV_FLOAT vfmin_vv_f64m8 -#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 -#define VMFIRSTM vmfirst_m_b8 +#define VMFGTVV_FLOAT RISCV_RVV(vmfgt_vv_f64m8_b8) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m8) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFMINVV_FLOAT RISCV_RVV(vfmin_vv_f64m8) +#define VMFLEVF_FLOAT RISCV_RVV(vmfle_vf_f64m8_b8) +#define VMFIRSTM RISCV_RVV(vfirst_m_b8) #define UINT_V_T vuint64m8_t #define VSEVU_UINT vse64_v_u64m8 #define UINT_T long unsigned int -#define VIDV_MASK_UINT vid_v_u64m8_m -#define VIDV_UINT vid_v_u64m8 -#define VADDVX_MASK_UINT vadd_vx_u64m8_m -#define VADDVX_UINT vadd_vx_u64m8 -#define VFADDVV_FLOAT vfadd_vv_f64m8 -#define VMVVX_UINT vmv_v_x_u64m8 +#define VIDV_UINT RISCV_RVV(vid_v_u64m8) +#define VADDVX_UINT RISCV_RVV(vadd_vx_u64m8) +#define VMVVX_UINT RISCV_RVV(vmv_v_x_u64m8) +#define VFABS_FLOAT RISCV_RVV(vfabs_v_f64m8) +#define VFADDVV_FLOAT RISCV_RVV(vfadd_vv_f64m8) +#define VMV_X RISCV_RVV(vmv_x_s_u64m8_u64) #else -#define ABS fabsf -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m8)(n) #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMINVS_FLOAT(va, vb, gvl) RISCV_RVV(vfredmin_vs_f32m8_f32m1)(v_res, va, vb, gvl) +#define VIDV_MASK_UINT RISCV_RVV(vid_v_u32m8_m) +#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u32m8_m) +#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u32m8)(vm, compressed, va, gvl) +#else +#define VFREDMINVS_FLOAT RISCV_RVV(vfredmin_vs_f32m8_f32m1) +#define VIDV_MASK_UINT RISCV_RVV(vid_v_u32m8_mu) +#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u32m8_mu) +#define VCOMPRESS RISCV_RVV(vcompress_vm_u32m8) +#endif #define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m -#define VFMINVV_FLOAT vfmin_vv_f32m8 -#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 -#define VMFIRSTM vmfirst_m_b4 +#define VMFGTVV_FLOAT RISCV_RVV(vmfgt_vv_f32m8_b4) +#define 
VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m8) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFMINVV_FLOAT RISCV_RVV(vfmin_vv_f32m8) +#define VMFLEVF_FLOAT RISCV_RVV(vmfle_vf_f32m8_b4) +#define VMFIRSTM RISCV_RVV(vfirst_m_b4) #define UINT_V_T vuint32m8_t #define UINT_T unsigned int -#define VSEVU_UINT vse32_v_u32m8 -#define VIDV_MASK_UINT vid_v_u32m8_m -#define VIDV_UINT vid_v_u32m8 -#define VADDVX_MASK_UINT vadd_vx_u32m8_m -#define VADDVX_UINT vadd_vx_u32m8 -#define VFADDVV_FLOAT vfadd_vv_f32m8 -#define VMVVX_UINT vmv_v_x_u32m8 +#define VSEVU_UINT RISCV_RVV(vse32_v_u32m8) +#define VIDV_UINT RISCV_RVV(vid_v_u32m8) +#define VADDVX_UINT RISCV_RVV(vadd_vx_u32m8) +#define VMVVX_UINT RISCV_RVV(vmv_v_x_u32m8) +#define VFABS_FLOAT RISCV_RVV(vfabs_v_f32m8) +#define VFADDVV_FLOAT RISCV_RVV(vfadd_vv_f32m8) +#define VMV_X RISCV_RVV(vmv_x_s_u32m8_u32) #endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i=0, j=0; - FLOAT minf=FLT_MAX; + BLASLONG i=0, j=0; unsigned int min_index = 0; - if (n <= 0 || inc_x <= 0) return(min_index); + if (n <= 0 || inc_x <= 0) return(min_index); + FLOAT minf=FLT_MAX; - FLOAT_V_T vx0, vx1, v_min; + FLOAT_V_T vx, vx2, v_min; UINT_V_T v_min_index; - MASK_T mask0, mask1; + MASK_T mask; unsigned int gvl = 0; - FLOAT_V_T_M1 v_res, v_max; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1); gvl = VSETVL(n); - UINT_T temp_uint[gvl]; - v_min_index = VMVVX_UINT(0, gvl); + unsigned int stride_x = inc_x * 2 * sizeof(FLOAT); + unsigned int idx = 0, inc_v = gvl * inc_x * 2; + v_min = VFMVVF_FLOAT(FLT_MAX, gvl); - BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); - BLASLONG inc_xv = gvl * inc_x * 2; - BLASLONG ix = 0; + v_min_index = VMVVX_UINT(0, gvl); for(i=0,j=0; i < n/gvl; i++){ - vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); - //fabs(vector) - mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); - vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e64,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx0) - :"v"(mask0), "f"(zero), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e32,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx0) - :"v"(mask0), "f"(zero), "r"(gvl) - :"v0"); -#endif -*/ - vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); - //fabs(vector) - mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); - vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e64,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx1) - :"v"(mask1), "f"(zero), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e32,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx1) - :"v"(mask1), "f"(zero), "r"(gvl) - :"v0"); -#endif -*/ - vx0 = VFADDVV_FLOAT(vx0, vx1, gvl); + vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); + vx2 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl); + vx = VFABS_FLOAT(vx, gvl); + vx2 = VFABS_FLOAT(vx2, gvl); + vx = VFADDVV_FLOAT(vx, vx2, gvl); - //index where element less than v_min - mask0 = VMFLTVV_FLOAT(vx0, v_min, gvl); - v_min_index = VIDV_MASK_UINT(mask0, v_min_index, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1 \n\t" - "vsetvli x0, %2, e64,m8 \n\t" - "vid.v %0, v0.t \n\t" - :"+v"(v_min_index) - :"v"(mask0), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1 \n\t" - "vsetvli x0, %2, e32,m8 \n\t" - "vid.v %0, v0.t \n\t" - 
:"+v"(v_min_index) - :"v"(mask0), "r"(gvl) - :"v0"); -#endif -*/ - v_min_index = VADDVX_MASK_UINT(mask0, v_min_index, v_min_index, j, gvl); + + //index where element greater than v_min + mask = VMFGTVV_FLOAT(v_min, vx, gvl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, gvl); //update v_min and start_index j - v_min = VFMINVV_FLOAT(v_min, vx0, gvl); + v_min = VFMINVV_FLOAT(v_min, vx, gvl); j += gvl; - ix += inc_xv; + idx += inc_v; } - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - minf = VFMVFS_FLOAT(v_res); - mask0 = VMFLEVF_FLOAT(v_min, minf, gvl); - min_index = VMFIRSTM(mask0,gvl); - VSEVU_UINT(temp_uint,v_min_index,gvl); - min_index = temp_uint[min_index]; + + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); + minf = EXTRACT_FLOAT(v_res); + mask = VMFLEVF_FLOAT(v_min, minf, gvl); + UINT_V_T compressed; + compressed = VCOMPRESS(v_min_index, mask, gvl); + min_index = VMV_X(compressed); if(j < n){ gvl = VSETVL(n-j); - v_min_index = VMVVX_UINT(0, gvl); - vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); - //fabs(vector) - mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); - vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e64,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx0) - :"v"(mask0), "f"(zero), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e32,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx0) - :"v"(mask0), "f"(zero), "r"(gvl) - :"v0"); -#endif -*/ - vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); - //fabs(vector) - mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); - vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e64,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx1) - :"v"(mask1), "f"(zero), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e32,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx1) - :"v"(mask1), "f"(zero), "r"(gvl) - :"v0"); -#endif -*/ - v_min = VFADDVV_FLOAT(vx0, vx1, gvl); - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - FLOAT cur_minf = VFMVFS_FLOAT(v_res); + v_min = VLSEV_FLOAT(&x[idx], stride_x, gvl); + vx2 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl); + v_min = VFABS_FLOAT(v_min, gvl); + vx2 = VFABS_FLOAT(vx2, gvl); + v_min = VFADDVV_FLOAT(v_min, vx2, gvl); + + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); + FLOAT cur_minf = EXTRACT_FLOAT(v_res); if(cur_minf < minf){ //tail index v_min_index = VIDV_UINT(gvl); v_min_index = VADDVX_UINT(v_min_index, j, gvl); - mask0 = VMFLEVF_FLOAT(v_min, cur_minf, gvl); - min_index = VMFIRSTM(mask0,gvl); - VSEVU_UINT(temp_uint,v_min_index,gvl); - min_index = temp_uint[min_index]; - + mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); + UINT_V_T compressed; + compressed = VCOMPRESS(v_min_index, mask, gvl); + min_index = VMV_X(compressed); } } - return(min_index+1); -} - + return(min_index+1); +} diff --git a/kernel/riscv64/max_rvv.c b/kernel/riscv64/max_rvv.c new file mode 100644 index 000000000..745c27bf4 --- /dev/null +++ b/kernel/riscv64/max_rvv.c @@ -0,0 +1,98 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m8_f32m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f32m8_tu +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m8_f64m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f64m8_tu +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT maxf = 0.0; + + if (n <= 0 || inc_x <= 0) return(maxf); + + FLOAT_V_T vx, vmax; + FLOAT_V_T_M1 v_res; + + v_res = VFMVVF_FLOAT_M1(-FLT_MAX, VSETVL_MAX_M1); + size_t vlmax = VSETVL_MAX; + vmax = VFMVVF_FLOAT(-FLT_MAX, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vmax = VFMAXVV_FLOAT_TU(vmax, vmax, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vmax = VFMAXVV_FLOAT_TU(vmax, vmax, vx, vl); + } + + } + + v_res = VFREDMAXVS_FLOAT(vmax, v_res, vlmax); + maxf = VFMVFS_FLOAT_M1(v_res); + + return(maxf); +} diff --git a/kernel/riscv64/max_vector.c b/kernel/riscv64/max_vector.c index 7f31e9a53..ee9920cd2 100644 --- a/kernel/riscv64/max_vector.c +++ b/kernel/riscv64/max_vector.c @@ -28,29 +28,47 @@ USE OF THIS SOFTWARE, EVEN IF 
ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include #include -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMAXVV_FLOAT vfmax_vv_f32m8 + +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 32 +# else +# define ELEN 32 +# define MLEN 16 +# endif +#else +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 8 +# else +# define ELEN 32 +# define MLEN 4 +# endif +#endif + +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMAXVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredmax_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))(v_res, va, vb, gvl) #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMAXVV_FLOAT vfmax_vv_f64m8 +#define VFREDMAXVS_FLOAT JOIN(RISCV_RVV(vfredmax_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) #endif +#define MASK_T JOIN(vbool, MLEN, _t, _, _) +#define VMFLTVF_FLOAT JOIN(RISCV_RVV(vmflt_vf_f), ELEN, LMUL, _b, MLEN) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) +#define VFMAXVV_FLOAT JOIN(RISCV_RVV(vfmax), _vv_f, ELEN, LMUL, _) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { @@ -59,10 +77,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT maxf=-FLT_MAX; unsigned int gvl = 0; FLOAT_V_T v0, v1, v_max; - FLOAT_V_T_M1 v_res, v_min; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_min = VFMVVF_FLOAT_M1(-FLT_MAX, gvl); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(-FLT_MAX, 1); if(inc_x == 1){ gvl = VSETVL(n); @@ -76,15 +92,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v_max = VFMAXVV_FLOAT(v_max, v1, gvl); j += gvl * 2; } - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); - maxf = *((FLOAT*)&v_res); + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); } for(;j maxf) - maxf = *((FLOAT*)&v_res); + v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl); j += gvl; } }else{ @@ -102,18 +115,16 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) j += gvl * 2; idx += inc_xv * 2; } - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); - maxf = *((FLOAT*)&v_res); + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); } for(;j maxf) - maxf = *((FLOAT*)&v_res); + v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl); j += gvl; } } + maxf = EXTRACT_FLOAT(v_res); return(maxf); } diff --git a/kernel/riscv64/min_rvv.c b/kernel/riscv64/min_rvv.c new file mode 100644 index 000000000..78528fef9 --- /dev/null +++ b/kernel/riscv64/min_rvv.c @@ -0,0 +1,98 @@ 
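/*
 * Editorial note, not part of the patch: max_vector.c above (and min_vector.c just below)
 * now build their type and intrinsic names by token pasting, so that a single source file
 * can target LMUL=m8 by default or LMUL=m2 when RISCV64_ZVL256B is set.  A small standalone
 * illustration of the JOIN mechanism (the expansions shown in the comments are what the
 * pasting produces; the real kernels additionally pass the first token through RISCV_RVV()
 * so that the __riscv_-prefixed v1.0 intrinsic names are selected when available):
 */
#include <stdio.h>

#define _
#define JOIN2_X(x, y) x ## y
#define JOIN2(x, y) JOIN2_X(x, y)
#define JOIN(v, w, x, y, z) JOIN2(JOIN2(JOIN2(JOIN2(v, w), x), y), z)

#define ELEN 32
#define LMUL m8

#define STR_X(x) #x
#define STR(x) STR_X(x)

int main(void)
{
    puts(STR(JOIN(vfloat, ELEN, LMUL, _t, _)));      /* prints "vfloat32m8_t"  */
    puts(STR(JOIN(vle, ELEN, _v_f, ELEN, LMUL)));    /* prints "vle32_v_f32m8" */
    return 0;
}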
+/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f32m8_tu +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f64m8_tu +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT minf = 0.0; + + if (n <= 0 || inc_x <= 0) return(minf); + + FLOAT_V_T vx, vmin; + FLOAT_V_T_M1 v_res; + + v_res = VFMVVF_FLOAT_M1(FLT_MAX, VSETVL_MAX_M1); + size_t vlmax = VSETVL_MAX; + vmin = VFMVVF_FLOAT(FLT_MAX, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vmin = VFMINVV_FLOAT_TU(vmin, vmin, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vmin = VFMINVV_FLOAT_TU(vmin, vmin, vx, vl); + } + + } + + v_res = VFREDMINVS_FLOAT(vmin, v_res, 
vlmax); + minf = VFMVFS_FLOAT_M1(v_res); + + return(minf); +} diff --git a/kernel/riscv64/min_vector.c b/kernel/riscv64/min_vector.c index 14b7e01ed..2001840bb 100644 --- a/kernel/riscv64/min_vector.c +++ b/kernel/riscv64/min_vector.c @@ -28,29 +28,47 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include #include -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMINVV_FLOAT vfmin_vv_f32m8 + +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 32 +# else +# define ELEN 32 +# define MLEN 16 +# endif +#else +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 8 +# else +# define ELEN 32 +# define MLEN 4 +# endif +#endif + +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMINVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredmin_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))(v_res, va, vb, gvl) #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMINVV_FLOAT vfmin_vv_f64m8 +#define VFREDMINVS_FLOAT JOIN(RISCV_RVV(vfredmin_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) #endif +#define MASK_T JOIN(vbool, MLEN, _t, _, _) +#define VMFLTVF_FLOAT JOIN(RISCV_RVV(vmflt_vf_f), ELEN, LMUL, _b, MLEN) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) +#define VFMINVV_FLOAT JOIN(RISCV_RVV(vfmin), _vv_f, ELEN, LMUL, _) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { @@ -59,10 +77,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT minf=FLT_MAX; unsigned int gvl = 0; FLOAT_V_T v0, v1, v_min; - FLOAT_V_T_M1 v_res, v_max; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1); if(inc_x == 1){ gvl = VSETVL(n); @@ -76,15 +92,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v_min = VFMINVV_FLOAT(v_min, v1, gvl); j += gvl * 2; } - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - minf = *((FLOAT*)&v_res); + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); } for(;j= gvl ) // don't pay overheads if we're not doing useful work + { + for(i=0; i= gvl ) // don't pay overheads if we're not doing useful work + { + for(i=0; i 0; n -= vl, x += vl, y += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vy = VLEV_FLOAT(y, vl); + + v0 = VFMULVF_FLOAT(vx, c, vl); + v0 = VFMACCVF_FLOAT(v0, s, vy, vl); + VSEV_FLOAT(x, v0, vl); + + v1 = VFMULVF_FLOAT(vx, s, vl); + v1 = VFMSACVF_FLOAT(v1, c, 
vy, vl); + VSEV_FLOAT(y, v1, vl); + } + + } else if(inc_y == 1) { + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VLEV_FLOAT(y, vl); + + v0 = VFMULVF_FLOAT(vx, c, vl); + v0 = VFMACCVF_FLOAT(v0, s, vy, vl); + VSSEV_FLOAT(x, stride_x, v0, vl); + + v1 = VFMULVF_FLOAT(vx, s, vl); + v1 = VFMSACVF_FLOAT(v1, c, vy, vl); + VSEV_FLOAT(y, v1, vl); + } + + } else if(inc_x == 1) { + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vy = VLSEV_FLOAT(y, stride_y, vl); + + v0 = VFMULVF_FLOAT(vx, c, vl); + v0 = VFMACCVF_FLOAT(v0, s, vy, vl); + VSEV_FLOAT(x, v0, vl); + + v1 = VFMULVF_FLOAT(vx, s, vl); + v1 = VFMSACVF_FLOAT(v1, c, vy, vl); + VSSEV_FLOAT(y, stride_y, v1, vl); + } + + } else { + BLASLONG stride_x = inc_x * sizeof(FLOAT); + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VLSEV_FLOAT(y, stride_y, vl); + + v0 = VFMULVF_FLOAT(vx, c, vl); + v0 = VFMACCVF_FLOAT(v0, s, vy, vl); + VSSEV_FLOAT(x, stride_x, v0, vl); + + v1 = VFMULVF_FLOAT(vx, s, vl); + v1 = VFMSACVF_FLOAT(v1, c, vy, vl); + VSSEV_FLOAT(y, stride_y, v1, vl); + } + + } + + return(0); +} diff --git a/kernel/riscv64/rot_vector.c b/kernel/riscv64/rot_vector.c index f3786e1d0..649d9bb94 100644 --- a/kernel/riscv64/rot_vector.c +++ b/kernel/riscv64/rot_vector.c @@ -28,27 +28,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() #define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT vle32_v_f32m4 -#define VLSEV_FLOAT vlse32_v_f32m4 -#define VSEV_FLOAT vse32_v_f32m4 -#define VSSEV_FLOAT vsse32_v_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 -#define VFMULVF_FLOAT vfmul_vf_f32m4 -#define VFMSACVF_FLOAT vfmsac_vf_f32m4 +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) +#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f32m4) +#define VFMSACVF_FLOAT RISCV_RVV(vfmsac_vf_f32m4) #else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() #define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT vle64_v_f64m4 -#define VLSEV_FLOAT vlse64_v_f64m4 -#define VSEV_FLOAT vse64_v_f64m4 -#define VSSEV_FLOAT vsse64_v_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 -#define VFMULVF_FLOAT vfmul_vf_f64m4 -#define VFMSACVF_FLOAT vfmsac_vf_f64m4 +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) +#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f64m4) +#define VFMSACVF_FLOAT RISCV_RVV(vfmsac_vf_f64m4) #endif int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) @@ -57,11 +57,10 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT BLASLONG ix=0,iy=0; if(n <= 0) return(0); - 
unsigned int gvl = 0; + unsigned int gvl = VSETVL((inc_x != 0 && inc_y != 0) ? n : 1); FLOAT_V_T v0, v1, vx, vy; if(inc_x == 1 && inc_y == 1){ - gvl = VSETVL(n); for(i=0,j=0; i 0; n -= vl, x += vl) { + vl = VSETVL(n); + VSEV_FLOAT(x, v0, vl); + } + } + else { + for (size_t vl; n > 0; n -= vl, x += vl) { + vl = VSETVL(n); + v0 = VLEV_FLOAT(x, vl); + v0 = VFMULVF_FLOAT(v0, da, vl); + VSEV_FLOAT(x, v0, vl); + } + } + } else { + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + if(da == 0.0) { + int gvl = VSETVL_MAX; + v0 = VFMVVF_FLOAT(0.0, gvl); + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + VSSEV_FLOAT(x, stride_x, v0, vl); + } + } + else { + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + v0 = VLSEV_FLOAT(x, stride_x, vl); + v0 = VFMULVF_FLOAT(v0, da, vl); + VSSEV_FLOAT(x, stride_x, v0, vl); + } + } + } + + return 0; +} diff --git a/kernel/riscv64/scal_vector.c b/kernel/riscv64/scal_vector.c index 8b9ef5a3e..8fa9315f6 100644 --- a/kernel/riscv64/scal_vector.c +++ b/kernel/riscv64/scal_vector.c @@ -26,28 +26,41 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VSEV_FLOAT vse32_v_f32m8 -#define VSSEV_FLOAT vsse32_v_f32m8 -#define VFMULVF_FLOAT vfmul_vf_f32m8 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 + +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 32 +# else +# define ELEN 32 +# define MLEN 16 +# endif #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VSEV_FLOAT vse64_v_f64m8 -#define VSSEV_FLOAT vsse64_v_f64m8 -#define VFMULVF_FLOAT vfmul_vf_f64m8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 8 +# else +# define ELEN 32 +# define MLEN 4 +# endif #endif +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#define VSEV_FLOAT JOIN(RISCV_RVV(vse), ELEN, _v_f, ELEN, LMUL) +#define VSSEV_FLOAT JOIN(RISCV_RVV(vsse), ELEN, _v_f, ELEN, LMUL) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) +#define VFMULVF_FLOAT JOIN(RISCV_RVV(vfmul), _vf_f, ELEN, LMUL, _) + int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i=0,j=0; @@ -84,25 +97,25 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS } }else{ if(da == 0.0){ + BLASLONG stride_x = inc_x * sizeof(FLOAT); + BLASLONG ix = 0; gvl = VSETVL(n); - BLASLONG stride_x = inc_x * sizeof(FLOAT); - BLASLONG ix = 0; - if(gvl <= n / 2){ - long int inc_xv = gvl * inc_x; - v0 = VFMVVF_FLOAT(0, gvl); - for(i = 0, j = 0; i < n/(2*gvl); i++, j+=2*gvl){ - VSSEV_FLOAT(&x[ix], stride_x, v0, gvl); - VSSEV_FLOAT(&x[ix + inc_xv], stride_x, v0, gvl); - ix += inc_xv * 2; - } 
+ v0 = VFMVVF_FLOAT(0, gvl); + + for(i = 0; i < n/(gvl*2); ++i ){ + VSSEV_FLOAT(&x[ix], stride_x, v0, gvl); + ix += inc_x * gvl; + VSSEV_FLOAT(&x[ix], stride_x, v0, gvl); + ix += inc_x * gvl; } - //tail - for(; j 0; n -= vl, x += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vsum = VFADDVV_FLOAT_TU(vsum, vsum, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vsum = VFADDVV_FLOAT_TU(vsum, vsum, vx, vl); + } + + } + + v_res = VFREDSUMVS_FLOAT(vsum, v_res, vlmax); + sumf = VFMVFS_FLOAT_M1(v_res); + return(sumf); +} diff --git a/kernel/riscv64/sum_vector.c b/kernel/riscv64/sum_vector.c new file mode 100644 index 000000000..cf734faab --- /dev/null +++ b/kernel/riscv64/sum_vector.c @@ -0,0 +1,114 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
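/*
 * Editorial note, not part of the patch: the new sum_rvv.c above (like the other *_rvv.c
 * reductions in this change set) accumulates one partial sum per vector lane using the
 * tail-undisturbed (_tu) form of vfadd, so a short final iteration leaves the unused
 * lanes' partials intact, and performs a single vfredusum reduction at the very end.
 * A rough scalar sketch of that strategy, with a fixed LANES standing in for the runtime
 * vector length and plain C types (assumes inc_x >= 1; the helper name sum_ref is
 * hypothetical):
 */
double sum_ref(long n, const double *x, long inc_x)
{
    enum { LANES = 8 };                         /* stand-in for the hardware vector length */
    double partial[LANES] = { 0.0 };
    long i = 0;
    for (; i + LANES <= n; i += LANES)          /* full vector iterations */
        for (int l = 0; l < LANES; l++)
            partial[l] += x[(i + l) * inc_x];
    for (; i < n; i++)                          /* tail: only the first lanes are written, */
        partial[i % LANES] += x[i * inc_x];     /* the rest stay "undisturbed"             */
    double s = 0.0;
    for (int l = 0; l < LANES; l++)             /* final cross-lane reduction (vfredusum)  */
        s += partial[l];
    return s;
}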
+*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define VSETVL(n) RISCV_RVV(vsetvl_e32m8)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8) +#define VFREDSUMVS_FLOAT RISCV_RVV(vfredusum_vs_f32m8_f32m1) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m8) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFADDVV_FLOAT RISCV_RVV(vfadd_vv_f32m8) +#else +#define VSETVL(n) RISCV_RVV(vsetvl_e64m8)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m8) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m8) +#define VFREDSUMVS_FLOAT RISCV_RVV(vfredusum_vs_f64m8_f64m1) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m8) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFADDVV_FLOAT RISCV_RVV(vfadd_vv_f64m8) +#endif +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + BLASLONG ix=0; + FLOAT asumf=0.0; + if (n <= 0 || inc_x <= 0) return(asumf); + unsigned int gvl = 0; + FLOAT_V_T v0, v1, v_sum; + FLOAT_V_T_M1 v_res; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + + if(inc_x == 1){ + gvl = VSETVL(n); + if(gvl <= n/2){ + v_sum = VFMVVF_FLOAT(0, gvl); + for(i=0,j=0; i 0; m -= vl, ptr -= vl*inc_y) { + vl = VSETVL(m); + vy = VLSEV_FLOAT(ptr - 1, stride_y, vl); + VSSEV_FLOAT(ptr, stride_y, vy, vl); + } + y[0] = temp; + } + else if(inc_y == 0) { + FLOAT temp = y[0]; + y[0] = x[(n - 1) * inc_x]; + FLOAT* ptr = x + (n - 1) * inc_x; // start from the last one + stride_x = (0 - inc_x) * sizeof(FLOAT); // reverse + BLASLONG m = n - 1; + for (size_t vl; m > 0; m -= vl, ptr -= vl*inc_x) { + vl = VSETVL(m); + vx = VLSEV_FLOAT(ptr - 1, stride_x, vl); + VSSEV_FLOAT(ptr, stride_x, vx, vl); + } + x[0] = temp; + } + else if(inc_x == 1 && inc_y == 1) { + for (size_t vl; n > 0; n -= vl, x += vl, y += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vy = VLEV_FLOAT(y, vl); + VSEV_FLOAT(y, vx, vl); + VSEV_FLOAT(x, vy, vl); + } + + } else if (inc_y == 1) { + stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VLEV_FLOAT(y, vl); + VSEV_FLOAT(y, vx, vl); + VSSEV_FLOAT(x, stride_x, vy, vl); + } + + } else if(inc_x == 1) { + stride_y = inc_y * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vy = VLSEV_FLOAT(y, stride_y, vl); + VSSEV_FLOAT(y, stride_y, vx, vl); + VSEV_FLOAT(x, vy, vl); + } + + } else { + stride_x = inc_x * sizeof(FLOAT); + stride_y = inc_y * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VLSEV_FLOAT(y, stride_y, vl); + VSSEV_FLOAT(y, stride_y, vx, vl); + VSSEV_FLOAT(x, stride_x, vy, vl); + } + } + + return(0); +} diff --git a/kernel/riscv64/swap_vector.c b/kernel/riscv64/swap_vector.c index 82fa5ce31..f583f5392 100644 --- a/kernel/riscv64/swap_vector.c +++ b/kernel/riscv64/swap_vector.c @@ -27,35 +27,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
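/*
 * Editorial note, not part of the patch: apart from its inc == 0 special cases, the new
 * swap_rvv.c above implements the usual BLAS SWAP, exchanging two possibly strided vectors
 * element by element with vle/vlse loads and vse/vsse stores.  A scalar reference of the
 * common case with positive strides (plain C types; the helper name swap_ref is
 * hypothetical):
 */
void swap_ref(long n, double *x, long inc_x, double *y, long inc_y)
{
    for (long i = 0; i < n; i++) {
        double tmp   = x[i * inc_x];    /* unit-stride or strided load of x */
        x[i * inc_x] = y[i * inc_y];    /* store y's element into x         */
        y[i * inc_y] = tmp;             /* and x's old element into y       */
    }
}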
#include "common.h" #include -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VSEV_FLOAT vse32_v_f32m8 -#define VSSEV_FLOAT vsse32_v_f32m8 + +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 32 +# else +# define ELEN 32 +# define MLEN 16 +# endif #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VSEV_FLOAT vse64_v_f64m8 -#define VSSEV_FLOAT vsse64_v_f64m8 +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 8 +# else +# define ELEN 32 +# define MLEN 4 +# endif #endif +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#define VSEV_FLOAT JOIN(RISCV_RVV(vse), ELEN, _v_f, ELEN, LMUL) +#define VSSEV_FLOAT JOIN(RISCV_RVV(vsse), ELEN, _v_f, ELEN, LMUL) + int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i = 0, j = 0; BLASLONG ix = 0,iy = 0; BLASLONG stride_x, stride_y; FLOAT_V_T vx0, vx1, vy0, vy1; - unsigned int gvl = 0; - if (n < 0) return(0); + if (n <= 0) return(0); + + unsigned int gvl = VSETVL((inc_x != 0 && inc_y != 0) ? n : 1); + if( inc_x == 0 && inc_y == 0 ) { n = n & 1; } + if(inc_x == 1 && inc_y == 1){ - gvl = VSETVL(n); if(gvl <= n/2){ for(i=0,j=0; i 0; js -= vl, posX += vl) { + vl = VSETVL(js); + offset = posX - posY; + + ao1 = a + posX + posY * lda; + ao2 = a + posY + (posX) * lda; + + for (i = m; i > 0; i--, offset--) { + va2 = VLSEV_FLOAT(ao2, stride_lda, vl); + va1 = VLEV_FLOAT(ao1, vl); + + // offset > (0 - vindex) ---> (offset + vindex) > 0 + vindex = VADD_VX_INT(vindex_max, offset, vl); + vbool = VMSGT_VX_INT(vindex, 0, vl); + + vb = VMERGE_VVM_FLOAT(va2, va1, vbool, vl); + VSEV_FLOAT(b, vb, vl); + + b += vl; + ao1 += lda; + ao2++; + } + } + + return 0; +} + diff --git a/kernel/riscv64/symm_ucopy_rvv_v1.c b/kernel/riscv64/symm_ucopy_rvv_v1.c new file mode 100644 index 000000000..464f97b3a --- /dev/null +++ b/kernel/riscv64/symm_ucopy_rvv_v1.c @@ -0,0 +1,100 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
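/*
 * Editorial note, not part of the patch: the new symm_lcopy_rvv_v1.c above (and the
 * symm_ucopy variant whose diff starts here) pack a panel of a symmetric matrix of which
 * only one triangle is stored.  Per element the kernel loads both the row slice (strided
 * by lda) and the column slice (contiguous) and uses vid.v + vadd.vx + vmsgt + vmerge to
 * decide, from the element's position relative to the diagonal, which of the two supplies
 * the value.  In scalar terms the selection amounts to mirroring across the diagonal
 * (illustrative helper, not OpenBLAS code; 'lower' nonzero means the lower triangle holds
 * the data):
 */
double symm_at(const double *a, long lda, long row, long col, int lower)
{
    if (lower)
        return (row >= col) ? a[row + col * lda] : a[col + row * lda];
    else
        return (row <= col) ? a[row + col * lda] : a[col + row * lda];
}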
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define INT_V_T vint32m2_t +#define VID_V_INT __riscv_vid_v_i32m2 +#define VADD_VX_INT __riscv_vadd_vx_i32m2 +#define VMSGT_VX_INT __riscv_vmsgt_vx_i32m2_b16 +#define VBOOL_T vbool16_t +#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define INT_V_T vint64m2_t +#define VID_V_INT __riscv_vid_v_i64m2 +#define VADD_VX_INT __riscv_vadd_vx_i64m2 +#define VMSGT_VX_INT __riscv_vmsgt_vx_i64m2_b32 +#define VBOOL_T vbool32_t +#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f64m2 +#endif + +// Optimizes the implementation in ../generic/symm_ucopy_4.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b) +{ + BLASLONG i, js, offset; + + FLOAT *ao1, *ao2; + + BLASLONG stride_lda = sizeof(FLOAT)*lda; + + FLOAT_V_T vb, va1, va2; + VBOOL_T vbool; + INT_V_T vindex_max, vindex; + + size_t vl = VSETVL_MAX; + vindex_max = VID_V_INT(vl); + + for (js = n; js > 0; js -= vl, posX += vl) { + vl = VSETVL(js); + offset = posX - posY; + + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posX + 0 + posY * lda; + + for (i = m; i > 0; i--, offset--) { + va1 = VLSEV_FLOAT(ao1, stride_lda, vl); + va2 = VLEV_FLOAT(ao2, vl); + + // offset > (0 - vindex) ---> (offset + vindex) > 0 + vindex = VADD_VX_INT(vindex_max, offset, vl); + vbool = VMSGT_VX_INT(vindex, 0, vl); + + vb = VMERGE_VVM_FLOAT(va2, va1, vbool, vl); + VSEV_FLOAT(b, vb, vl); + + b += vl; + ao1++; + ao2 += lda; + } + } + + return 0; +} diff --git a/kernel/riscv64/symv_L_rvv.c b/kernel/riscv64/symv_L_rvv.c new file mode 100644 index 000000000..888d628a5 --- /dev/null +++ b/kernel/riscv64/symv_L_rvv.c @@ -0,0 +1,219 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define FLOAT_V_T_M1 vfloat32m1_t +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VSEV_FLOAT __riscv_vse32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m8 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m8_tu +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m8 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#else +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define FLOAT_V_T_M1 vfloat64m1_t +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VSEV_FLOAT __riscv_vse64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m8 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m8_tu +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m8 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#endif + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i, j, k; + BLASLONG ix,iy; + BLASLONG jx,jy; + FLOAT temp1; + FLOAT *a_ptr = a; + + FLOAT_V_T_M1 v_res, v_z0; + size_t vlmax = VSETVL_MAX_M1, vl; + v_z0 = VFMVVF_FLOAT_M1(0, vlmax); + vlmax = VSETVL_MAX; + + FLOAT_V_T va, vx, vy, vr; + BLASLONG stride_x, stride_y, inc_xv, inc_yv; + + if(inc_x == 1 && inc_y == 1) + { + for (j=0; j 0; k -= vl, i += vl) + { + vl = VSETVL(k); + va = VLEV_FLOAT(&a_ptr[i], vl); + vy = VLEV_FLOAT(&y[i], 
vl); + vy = VFMACCVF_FLOAT(vy, temp1, va, vl); + VSEV_FLOAT(&y[i], vy, vl); + + vx = VLEV_FLOAT(&x[i], vl); + vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); + + } + v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); + + y[j] += alpha * VFMVFS_FLOAT_M1(v_res); + a_ptr += lda; + } + } + else if(inc_x == 1) + { + jy = 0; + stride_y = inc_y * sizeof(FLOAT); + for (j=0; j 0; k -= vl, i += vl) + { + vl = VSETVL(k); + inc_yv = inc_y * vl; + va = VLEV_FLOAT(&a_ptr[i], vl); + vy = VLSEV_FLOAT(&y[iy], stride_y, vl); + vy = VFMACCVF_FLOAT(vy, temp1, va, vl); + VSSEV_FLOAT(&y[iy], stride_y, vy, vl); + + vx = VLEV_FLOAT(&x[i], vl); + vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); + + iy += inc_yv; + } + v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); + + y[jy] += alpha * VFMVFS_FLOAT_M1(v_res); + jy += inc_y; + a_ptr += lda; + } + } + else if(inc_y == 1) + { + jx = 0; + stride_x = inc_x * sizeof(FLOAT); + for (j=0; j 0; k -= vl, i += vl) + { + vl = VSETVL(k); + inc_xv = inc_x * vl; + + va = VLEV_FLOAT(&a_ptr[i], vl); + vy = VLEV_FLOAT(&y[i], vl); + vy = VFMACCVF_FLOAT(vy, temp1, va, vl); + VSEV_FLOAT(&y[i], vy, vl); + + vx = VLSEV_FLOAT(&x[ix], stride_x, vl); + vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); + + ix += inc_xv; + } + + v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); + + y[j] += alpha * VFMVFS_FLOAT_M1(v_res); + jx += inc_x; + a_ptr += lda; + } + } + else + { + stride_x = inc_x * sizeof(FLOAT); + stride_y = inc_y * sizeof(FLOAT); + jx = 0; + jy = 0; + for (j=0; j 0; k -= vl, i += vl) + { + vl = VSETVL(k); + inc_xv = inc_x * vl; + inc_yv = inc_y * vl; + + va = VLEV_FLOAT(&a_ptr[i], vl); + vy = VLSEV_FLOAT(&y[iy], stride_y, vl); + vy = VFMACCVF_FLOAT(vy, temp1, va, vl); + VSSEV_FLOAT(&y[iy], stride_y, vy, vl); + + vx = VLSEV_FLOAT(&x[ix], stride_x, vl); + vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); + + ix += inc_xv; + iy += inc_yv; + } + v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); + + y[jy] += alpha * VFMVFS_FLOAT_M1(v_res); + jx += inc_x; + jy += inc_y; + a_ptr += lda; + } + } + return(0); +} + diff --git a/kernel/riscv64/symv_L_vector.c b/kernel/riscv64/symv_L_vector.c index 58ec17b03..cd89c63ec 100644 --- a/kernel/riscv64/symv_L_vector.c +++ b/kernel/riscv64/symv_L_vector.c @@ -27,37 +27,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
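In symv_L_rvv.c above, each column j is handled in a single pass over its sub-diagonal part: the same vector va that updates y (y += temp1 * A(:,j), via VFMACCVF_FLOAT) also feeds the running dot product with x, accumulated tail-undisturbed in vr (VFMACCVV_FLOAT_TU) and reduced once per column with VFREDSUM_FLOAT. The scalar reference it vectorizes is essentially the textbook lower-triangular SYMV update; the sketch below assumes unit strides and ignores the kernel's offset argument, which only limits the column range handled by this routine.

/* Reference form of y = alpha*A*x + y with A symmetric and only its
 * lower triangle stored (column-major, leading dimension lda).  The RVV
 * kernel vectorizes the inner i loop, fusing the y update and the dot
 * product in one pass. */
static void symv_lower_ref(int m, double alpha, const double *a, int lda,
                           const double *x, double *y)
{
    for (int j = 0; j < m; j++) {
        double temp1 = alpha * x[j];
        double temp2 = 0.0;
        y[j] += temp1 * a[j + j * lda];          /* diagonal term        */
        for (int i = j + 1; i < m; i++) {
            y[i]  += temp1 * a[i + j * lda];     /* axpy with column j   */
            temp2 += a[i + j * lda] * x[i];      /* dot of column j, x   */
        }
        y[j] += alpha * temp2;                   /* mirrored upper part  */
    }
}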
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLEV_FLOAT vle32_v_f32m4 -#define VLSEV_FLOAT vlse32_v_f32m4 -#define VSEV_FLOAT vse32_v_f32m4 -#define VSSEV_FLOAT vsse32_v_f32m4 -#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 -#define VFMACCVV_FLOAT vfmacc_vv_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMULVV_FLOAT vfmul_vv_f32m4 +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f32m4_f32m1(v_res, va, vb, gvl) #else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m1() +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m4_f32m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m4) +#else +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLEV_FLOAT vle64_v_f64m4 -#define VLSEV_FLOAT vlse64_v_f64m4 -#define VSEV_FLOAT vse64_v_f64m4 -#define VSSEV_FLOAT vsse64_v_f64m4 -#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 -#define VFMACCVV_FLOAT vfmacc_vv_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMULVV_FLOAT vfmul_vv_f64m4 +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f64m4_f64m1(v_res, va, vb, gvl) +#else +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m4_f64m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m4) #endif int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) @@ -99,8 +105,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA i += gvl; } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp2 = VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + temp2 = EXTRACT_FLOAT(v_res); if(i < m){ gvl = VSETVL(m-i); vy = VLEV_FLOAT(&y[i], gvl); @@ -110,8 +116,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLEV_FLOAT(&x[i], gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp2 += VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + temp2 += EXTRACT_FLOAT(v_res); } } y[j] += alpha * temp2; @@ -144,8 +150,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA i += gvl; iy += inc_yv; } - 
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp2 = VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + temp2 = EXTRACT_FLOAT(v_res); if(i < m){ gvl = VSETVL(m-i); vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); @@ -155,8 +161,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLEV_FLOAT(&x[i], gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp2 += VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + temp2 += EXTRACT_FLOAT(v_res); } } y[jy] += alpha * temp2; @@ -190,8 +196,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA i += gvl; ix += inc_xv; } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp2 = VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + temp2 = EXTRACT_FLOAT(v_res); if(i < m){ gvl = VSETVL(m-i); vy = VLEV_FLOAT(&y[i], gvl); @@ -201,8 +207,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp2 += VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + temp2 += EXTRACT_FLOAT(v_res); } } y[j] += alpha * temp2; @@ -241,8 +247,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA ix += inc_xv; iy += inc_yv; } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp2 = VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + temp2 = EXTRACT_FLOAT(v_res); if(i < m){ gvl = VSETVL(m-i); vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); @@ -252,8 +258,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp2 += VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + temp2 += EXTRACT_FLOAT(v_res); } } y[jy] += alpha * temp2; diff --git a/kernel/riscv64/symv_U_rvv.c b/kernel/riscv64/symv_U_rvv.c new file mode 100644 index 000000000..bcd2f6981 --- /dev/null +++ b/kernel/riscv64/symv_U_rvv.c @@ -0,0 +1,216 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
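The symv_L_vector.c changes above follow a pattern repeated across these kernels: instead of spelling out version 0.10 intrinsic names (vle32_v_f32m4, vfredusum_vs_f32m4_f32m1 with an explicit destination operand), every intrinsic is built through RISCV_RVV(), and the scalar result is read back through EXTRACT_FLOAT, so one source file compiles against both the old 0.10 toolchains and the ratified __riscv_-prefixed intrinsics. vfredusum also changed its argument list between the two API revisions, which is why VFREDSUM_FLOAT gets a separate, argument-rewriting definition under RISCV_0p10_INTRINSICS. A sketch of the idea behind the wrapper follows; the real definitions live in the OpenBLAS common headers and may differ in detail.

/* Hedged sketch of an intrinsic-name compatibility macro: prepend the
 * __riscv_ prefix required by the ratified RVV intrinsics unless the
 * old v0.10 naming is requested at build time. */
#ifdef RISCV_0p10_INTRINSICS
#  define RISCV_RVV(x) x              /* v0.10 toolchains: bare names   */
#else
#  define RISCV_RVV(x) __riscv_##x    /* ratified API: __riscv_ prefix  */
#endif

/* e.g. RISCV_RVV(vle32_v_f32m4) expands to __riscv_vle32_v_f32m4 */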
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define FLOAT_V_T_M1 vfloat32m1_t +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VSEV_FLOAT __riscv_vse32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m8 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m8_tu +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m8 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#else +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define FLOAT_V_T_M1 vfloat64m1_t +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VSEV_FLOAT __riscv_vse64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m8 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m8_tu +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m8 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#endif + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i, j, k; + BLASLONG ix,iy; + BLASLONG jx,jy; + FLOAT temp1; + FLOAT *a_ptr = a; + FLOAT_V_T_M1 v_res, v_z0; + size_t vl_max = VSETVL_MAX_M1, vl; + v_z0 = VFMVVF_FLOAT_M1(0, vl_max); + vl_max = VSETVL_MAX; + + FLOAT_V_T va, vx, vy, vr; + BLASLONG stride_x, stride_y, inc_xv, inc_yv; + + BLASLONG m1 = m - offset; + if(inc_x == 1 && inc_y == 1) + { + a_ptr += m1 * lda; + for (j=m1; j 0; k -= vl, i += vl) + { + vl = VSETVL(k); + vy = VLEV_FLOAT(&y[i], vl); + va = VLEV_FLOAT(&a_ptr[i], vl); + vy = VFMACCVF_FLOAT(vy, temp1, va, vl); + VSEV_FLOAT(&y[i], vy, vl); + + vx = VLEV_FLOAT(&x[i], vl); + vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); + } + v_res = VFREDSUM_FLOAT(vr, v_z0, vl_max); + + y[j] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res); + a_ptr += lda; + } + } + else if(inc_x == 1) + { + jy = m1 * inc_y; + a_ptr += m1 * lda; + stride_y = inc_y * sizeof(FLOAT); + for (j=m1; j 0; k -= vl, i += vl) + { + vl = VSETVL(k); + inc_yv = inc_y * vl; + vy = VLSEV_FLOAT(&y[iy], stride_y, vl); + va = VLEV_FLOAT(&a_ptr[i], vl); + vy = VFMACCVF_FLOAT(vy, temp1, va, vl); + 
VSSEV_FLOAT(&y[iy], stride_y, vy, vl); + + vx = VLEV_FLOAT(&x[i], vl); + vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); + + iy += inc_yv; + } + v_res = VFREDSUM_FLOAT(vr, v_z0, vl_max); + + y[jy] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res); + a_ptr += lda; + jy += inc_y; + } + } + else if(inc_y == 1) + { + jx = m1 * inc_x; + a_ptr += m1 * lda; + stride_x = inc_x * sizeof(FLOAT); + for (j=m1; j 0; k -= vl, i += vl) + { + vl = VSETVL(k); + inc_xv = inc_x * vl; + + vy = VLEV_FLOAT(&y[i], vl); + va = VLEV_FLOAT(&a_ptr[i], vl); + vy = VFMACCVF_FLOAT(vy, temp1, va, vl); + VSEV_FLOAT(&y[i], vy, vl); + + vx = VLSEV_FLOAT(&x[ix], stride_x, vl); + vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); + + ix += inc_xv; + } + v_res = VFREDSUM_FLOAT(vr, v_z0, vl_max); + + y[j] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res); + a_ptr += lda; + jx += inc_x; + } + } + else + { + jx = m1 * inc_x; + jy = m1 * inc_y; + a_ptr += m1 * lda; + stride_x = inc_x * sizeof(FLOAT); + stride_y = inc_y * sizeof(FLOAT); + for (j=m1; j 0; k -= vl, i += vl) + { + vl = VSETVL(k); + inc_xv = inc_x * vl; + inc_yv = inc_y * vl; + vy = VLSEV_FLOAT(&y[iy], stride_y, vl); + va = VLEV_FLOAT(&a_ptr[i], vl); + vy = VFMACCVF_FLOAT(vy, temp1, va, vl); + VSSEV_FLOAT(&y[iy], stride_y, vy, vl); + + vx = VLSEV_FLOAT(&x[ix], stride_x, vl); + vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); + ix += inc_xv; + iy += inc_yv; + } + v_res = VFREDSUM_FLOAT(vr, v_z0, vl_max); + + y[jy] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res); + a_ptr += lda; + jx += inc_x; + jy += inc_y; + } + } + return(0); +} diff --git a/kernel/riscv64/symv_U_vector.c b/kernel/riscv64/symv_U_vector.c index 34ff0e30a..894c6a643 100644 --- a/kernel/riscv64/symv_U_vector.c +++ b/kernel/riscv64/symv_U_vector.c @@ -27,39 +27,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
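The non-unit-stride branches above (inc_x != 1 or inc_y != 1) differ from the contiguous path only in using the strided intrinsics VLSEV_FLOAT / VSSEV_FLOAT with a byte stride of inc * sizeof(FLOAT), and in advancing the scalar indices by inc * vl per vector iteration. A scalar picture of what one such strided load gathers is sketched below; the function name is illustrative, and the stride is given in elements here whereas the intrinsics take it in bytes.

/* What a strided vector load of vl elements amounts to, scalar view. */
#include <stddef.h>

static void gather_strided(double *dst, const double *src,
                           long inc, size_t vl)
{
    for (size_t k = 0; k < vl; k++)
        dst[k] = src[k * inc];    /* element k of the logical vector */
}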
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLEV_FLOAT vle32_v_f32m4 -#define VLSEV_FLOAT vlse32_v_f32m4 -#define VSEV_FLOAT vse32_v_f32m4 -#define VSSEV_FLOAT vsse32_v_f32m4 -#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 -#define VFMACCVV_FLOAT vfmacc_vv_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFDOTVV_FLOAT vfdot_vv_f32m4 -#define VFMULVV_FLOAT vfmul_vv_f32m4 +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f32m4_f32m1(v_res, va, vb, gvl) #else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m1() +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m4_f32m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFDOTVV_FLOAT RISCV_RVV(vfdot_vv_f32m4) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m4) +#else +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLEV_FLOAT vle64_v_f64m4 -#define VLSEV_FLOAT vlse64_v_f64m4 -#define VSEV_FLOAT vse64_v_f64m4 -#define VSSEV_FLOAT vsse64_v_f64m4 -#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 -#define VFMACCVV_FLOAT vfmacc_vv_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFDOTVV_FLOAT vfdot_vv_f64m4 -#define VFMULVV_FLOAT vfmul_vv_f64m4 +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f64m4_f64m1(v_res, va, vb, gvl) +#else +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m4_f64m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFDOTVV_FLOAT RISCV_RVV(vfdot_vv_f64m4) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m4) #endif int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) @@ -101,8 +107,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA i += gvl; } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp2 = VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + temp2 = EXTRACT_FLOAT(v_res); if(i < j){ gvl = VSETVL(j-i); vy = VLEV_FLOAT(&y[i], gvl); @@ -112,8 +118,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLEV_FLOAT(&x[i], gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp2 += VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + temp2 += 
EXTRACT_FLOAT(v_res); } } y[j] += temp1 * a_ptr[j] + alpha * temp2; @@ -145,8 +151,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA i += gvl; iy += inc_yv; } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp2 = VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + temp2 = EXTRACT_FLOAT(v_res); if(i < j){ gvl = VSETVL(j-i); vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); @@ -156,8 +162,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLEV_FLOAT(&x[i], gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp2 += VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + temp2 += EXTRACT_FLOAT(v_res); } } y[jy] += temp1 * a_ptr[j] + alpha * temp2; @@ -190,8 +196,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA i += gvl; ix += inc_xv; } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp2 = VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + temp2 = EXTRACT_FLOAT(v_res); if(i < j){ gvl = VSETVL(j-i); vy = VLEV_FLOAT(&y[i], gvl); @@ -201,8 +207,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp2 += VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + temp2 += EXTRACT_FLOAT(v_res); } } y[j] += temp1 * a_ptr[j] + alpha * temp2; @@ -240,8 +246,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA ix += inc_xv; iy += inc_yv; } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp2 = VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + temp2 = EXTRACT_FLOAT(v_res); if(i < j){ gvl = VSETVL(j-i); vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); @@ -251,8 +257,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp2 += VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + temp2 += EXTRACT_FLOAT(v_res); } } y[jy] += temp1 * a_ptr[j] + alpha * temp2; diff --git a/kernel/riscv64/trmm_lncopy_rvv_v1.c b/kernel/riscv64/trmm_lncopy_rvv_v1.c new file mode 100644 index 000000000..4135a9b62 --- /dev/null +++ b/kernel/riscv64/trmm_lncopy_rvv_v1.c @@ -0,0 +1,138 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u32m2_b16 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u32m2_b16 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u64m2_b32 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f64m2 +#endif + +// Optimizes the implementation in ../arm64/tmmm_lncopy_sve_v1.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, X; + + FLOAT *ao; + + BLASLONG stride_lda = sizeof(FLOAT)*lda; + + FLOAT_V_T vb, va1; + + size_t vl; +#ifdef UNIT + VBOOL_T vbool_eq; +#endif + + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + X = posX; + + if (posX <= posY) + { + ao = a + posY + posX * lda; + } + else + { + ao = a + posX + posY * lda; + } + + i = 0; + do + { + if (X > posY) + { + va1 = VLSEV_FLOAT(ao, stride_lda, vl); + VSEV_FLOAT(b, va1, vl); + + ao ++; + b += vl; + X ++; + i ++; + } + else if (X < posY) + { + ao += lda; + b += vl; + X ++; + i ++; + } + else + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + va1 = VLSEV_FLOAT(ao, stride_lda, vl); + vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); + vb = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); +#ifdef UNIT + vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); + vb = VFMERGE_VFM_FLOAT(vb, ONE, vbool_eq, vl); +#endif + VSEV_FLOAT(b, vb, vl); + ao++; + b += vl; + } + + X += vl; + i += vl; + } + } while (i < m); + + posY += vl; + } + + return 0; +} diff --git a/kernel/riscv64/trmm_ltcopy_rvv_v1.c b/kernel/riscv64/trmm_ltcopy_rvv_v1.c new file mode 100644 index 000000000..580714fde --- /dev/null +++ b/kernel/riscv64/trmm_ltcopy_rvv_v1.c @@ -0,0 +1,134 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
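In trmm_lncopy_rvv_v1.c above, only the blocks that straddle the diagonal (X == posY) need per-element masking: VID_V_UINT enumerates the lanes, VMSGTU_VX_UINT marks the lanes strictly above the diagonal of the current row, VFMERGE_VFM_FLOAT zeroes them, and under #ifdef UNIT a second merge forces the diagonal lane to ONE. A scalar sketch of what that branch packs for one vl x vl diagonal block of a lower-triangular matrix is given below; the function name and the unit flag (standing in for the UNIT compile-time switch) are illustrative.

/* Pack one vl x vl diagonal block of a lower triangular matrix, row by
 * row: entries strictly above the diagonal become 0, and with a unit
 * diagonal the diagonal entry itself is forced to 1.  Column-major
 * input with leading dimension lda; b receives vl values per row. */
#include <stddef.h>

static void pack_lower_diag_block(const double *a, long lda,
                                  double *b, size_t vl, int unit)
{
    for (size_t j = 0; j < vl; j++) {          /* output row j          */
        for (size_t k = 0; k < vl; k++) {      /* lanes 0 .. vl-1       */
            double v = a[j + k * lda];         /* A(j, k) of the block  */
            if (k > j)           v = 0.0;      /* strictly upper: zero  */
            if (unit && k == j)  v = 1.0;      /* unit diagonal         */
            *b++ = v;
        }
    }
}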
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u32m2_b16 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u32m2_b16 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u64m2_b32 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f64m2 +#endif + +// Optimizes the implementation in ../arm64/tmmm_ltcopy_sve_v1.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, X; + + FLOAT *ao; + + FLOAT_V_T vb, va1; + size_t vl; +#ifdef UNIT + VBOOL_T vbool_eq; +#endif + + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + X = posX; + + if (posX <= posY) + { + ao = a + posY + posX * lda; + } + else + { + ao = a + posX + posY * lda; + } + + i = 0; + do + { + if (X > posY) + { + ao ++; + b += vl; + X ++; + i ++; + } + else if (X < posY) + { + va1 = VLEV_FLOAT(ao, vl); + VSEV_FLOAT(b, va1, vl); + + ao += lda; + b += vl; + X ++; + i ++; + } + else + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + va1 = VLEV_FLOAT(ao, vl); + vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); + vb = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); +#ifdef UNIT + vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); + vb = VFMERGE_VFM_FLOAT(vb, ONE, vbool_eq, vl); +#endif + VSEV_FLOAT(b, vb, vl); + ao += lda; + b += vl; + } + X += vl; + i += vl; + + } + } while (i < m); + + posY += vl; + } + + return 0; +} + diff --git a/kernel/riscv64/trmm_uncopy_rvv_v1.c b/kernel/riscv64/trmm_uncopy_rvv_v1.c new file mode 100644 index 000000000..852ab7f11 --- /dev/null +++ b/kernel/riscv64/trmm_uncopy_rvv_v1.c @@ -0,0 +1,136 @@ 
+/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u32m2_b16 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u32m2_b16 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u64m2_b32 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f64m2 +#endif + +// Optimizes the implementation in ../arm64/tmmm_uncopy_sve_v1.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, X; + BLASLONG stride_lda = sizeof(FLOAT) * lda; + FLOAT *ao; + + FLOAT_V_T vb, va1; + size_t vl; + +#ifdef UNIT + VBOOL_T vbool_eq; +#endif + + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + X = posX; + + if (posX <= posY) + { + ao = a + posX + posY * lda; + } + else + { + ao = a + posY + posX * lda; + } + + i = 0; + do + { + if (X < posY) + { + va1 = VLSEV_FLOAT(ao, stride_lda, vl); + VSEV_FLOAT(b, va1, vl); + + ao ++; + b += vl; + X ++; + i ++; + } + else if (X > posY) + { + ao += lda; + b += vl; + X ++; + i ++; + } + else + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + va1 = VLSEV_FLOAT(ao, stride_lda, vl); + vbool_cmp = 
VMSLTU_VX_UINT(vindex, j, vl); + vb = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); +#ifdef UNIT + vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); + vb = VFMERGE_VFM_FLOAT(vb, ONE, vbool_eq, vl); +#endif + VSEV_FLOAT(b, vb, vl); + ao++; + b += vl; + } + + X += vl; + i += vl; + } + }while (i < m); + + posY += vl; + } + + return 0; +} diff --git a/kernel/riscv64/trmm_utcopy_rvv_v1.c b/kernel/riscv64/trmm_utcopy_rvv_v1.c new file mode 100644 index 000000000..e0b6d362d --- /dev/null +++ b/kernel/riscv64/trmm_utcopy_rvv_v1.c @@ -0,0 +1,133 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u32m2_b16 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u32m2_b16 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u64m2_b32 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f64m2 +#endif + +// Optimizes the implementation in ../arm64/tmmm_utcopy_sve_v1.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, j, js, X; + + FLOAT *ao; + FLOAT_V_T vb, va1; +#ifdef UNIT + VBOOL_T vbool_eq; +#endif + + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + size_t vl; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + + X = posX; + + if (posX <= posY) + { + ao = a + posX + posY * lda; + } + else + { + ao = a + posY + posX * lda; + } + + i = 0; + do + { + if (X < posY) + { + ao ++; + b += vl; + X ++; + i++; + } + else if (X > posY) + { + va1 = VLEV_FLOAT(ao, vl); + VSEV_FLOAT(b, va1, vl); + ao += lda; + b += vl; + X++; + i++; + } + else + { + vindex = VID_V_UINT(vl); + for (j = 0; j < vl; j++) + { + va1 = VLEV_FLOAT(ao, vl); + vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); + vb = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); +#ifdef UNIT + vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); + vb = VFMERGE_VFM_FLOAT(vb, ONE, vbool_eq, vl); +#endif + VSEV_FLOAT(b, vb, vl); + ao += lda; + b += vl; + } + X += vl; + i += vl; + } + }while (i < m); + posY += vl; + } + return 0; +} + diff --git a/kernel/riscv64/trmmkernel_rvv_v1x8.c b/kernel/riscv64/trmmkernel_rvv_v1x8.c new file mode 100644 index 000000000..393b24bce --- /dev/null +++ b/kernel/riscv64/trmmkernel_rvv_v1x8.c @@ -0,0 +1,685 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m2 +#endif + + +// Optimizes the implementation in ../generic/trmmkernel_8x8.c + + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset) +{ + //fprintf(stderr, "%s, %s, bm=%4ld bn=%4ld bk=%4ld alpha=%f ldc=%ld\n", __FILE__, __FUNCTION__, bm, bn, bk, alpha, ldc); + + BLASLONG i,j,k; + FLOAT *C0,*C1,*C2,*C3,*C4,*C5,*C6,*C7,*ptrba,*ptrbb; + + FLOAT_V_T va0, va1, va2, va3, va4, va5, va6, va7; + FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7; + size_t vl; + + BLASLONG off, temp; + +#if !defined(LEFT) + off = -offset; +#else + off = 0; +#endif + for (j = bn/8; j > 0; j--) + { + C0 = C; + C1 = C0+ldc; + C2 = C1+ldc; + C3 = C2+ldc; + C4 = C3+ldc; + C5 = C4+ldc; + C6 = C5+ldc; + C7 = C6+ldc; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + + ptrba = ba; + + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*vl; + ptrbb = bb + off*8; +#endif + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + vres2 = VFMVVF_FLOAT(0.0, vl); + vres3 = VFMVVF_FLOAT(0.0, vl); + vres4 = VFMVVF_FLOAT(0.0, vl); + vres5 = VFMVVF_FLOAT(0.0, vl); + vres6 = VFMVVF_FLOAT(0.0, vl); + vres7 = VFMVVF_FLOAT(0.0, vl); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+vl; // number of values in A +#else + temp = off+8; // number of values in B +#endif + + for (k = temp/8; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + va1 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl); + ptrbb += 8; + va2 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl); + 
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va1, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va1, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va1, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va1, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va1, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va1, vl); + ptrbb += 8; + va3 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va2, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va2, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va2, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va2, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va2, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va2, vl); + ptrbb += 8; + va4 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va3, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va3, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va3, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va3, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va3, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va3, vl); + ptrbb += 8; + va5 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va4, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va4, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va4, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va4, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va4, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va4, vl); + ptrbb += 8; + va6 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va5, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va5, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va5, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va5, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va5, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va5, vl); + ptrbb += 8; + va7 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va6, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va6, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va6, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va6, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va6, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va6, vl); + ptrbb += 8; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va7, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va7, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va7, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va7, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va7, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va7, vl); + ptrbb += 8; + } + + for (k = temp&7; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); // M:8 (should be vlen); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb 
+ 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl); + + ptrbb += 8; + ptrba += vl; + } + + va0 = VFMULVF_FLOAT(vres0, alpha, vl); + VSEV_FLOAT(C0, va0, vl); + + va1 = VFMULVF_FLOAT(vres1, alpha, vl); + VSEV_FLOAT(C1, va1, vl); + + va2 = VFMULVF_FLOAT(vres2, alpha, vl); + VSEV_FLOAT(C2, va2, vl); + + va3 = VFMULVF_FLOAT(vres3, alpha, vl); + VSEV_FLOAT(C3, va3, vl); + + va4 = VFMULVF_FLOAT(vres4, alpha, vl); + VSEV_FLOAT(C4, va4, vl); + + va5 = VFMULVF_FLOAT(vres5, alpha, vl); + VSEV_FLOAT(C5, va5, vl); + + va6 = VFMULVF_FLOAT(vres6, alpha, vl); + VSEV_FLOAT(C6, va6, vl); + + va7 = VFMULVF_FLOAT(vres7, alpha, vl); + VSEV_FLOAT(C7, va7, vl); + + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= vl; // number of values in A +#else + temp -= 8; // number of values in B +#endif + ptrba += temp*vl; + ptrbb += temp*8; +#endif + +#ifdef LEFT + off += vl; // number of values in A +#endif + + C0 += vl; + C1 += vl; + C2 += vl; + C3 += vl; + C4 += vl; + C5 += vl; + C6 += vl; + C7 += vl; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 8; +#endif + + bb += (bk<<3); + C += (ldc<<3); + } + + if (bn & 4) + { + C0 = C; + C1 = C0+ldc; + C2 = C1+ldc; + C3 = C2+ldc; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + ptrba = ba; + + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*vl; + ptrbb = bb + off*4; +#endif + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + vres2 = VFMVVF_FLOAT(0.0, vl); + vres3 = VFMVVF_FLOAT(0.0, vl); + + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+vl; // number of values in A +#else + temp = off+4; // number of values in B +#endif + + for (k = temp/8; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + va1 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + ptrbb += 4; + va2 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va1, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va1, vl); + ptrbb += 4; + va3 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va2, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va2, vl); + ptrbb += 4; + va4 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va3, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va3, vl); + ptrbb += 4; + va5 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); + vres1 = VFMACCVF_FLOAT(vres1, 
*(ptrbb + 1), va4, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va4, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va4, vl); + ptrbb += 4; + va6 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va5, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va5, vl); + ptrbb += 4; + va7 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va6, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va6, vl); + ptrbb += 4; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va7, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va7, vl); + ptrbb += 4; + } + + // K remainder + for (k = temp&7; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + + ptrbb += 4; + ptrba += vl; + } + + va0 = VFMULVF_FLOAT(vres0, alpha, vl); + VSEV_FLOAT(C0, va0, vl); + + va1 = VFMULVF_FLOAT(vres1, alpha, vl); + VSEV_FLOAT(C1, va1, vl); + + va2 = VFMULVF_FLOAT(vres2, alpha, vl); + VSEV_FLOAT(C2, va2, vl); + + va3 = VFMULVF_FLOAT(vres3, alpha, vl); + VSEV_FLOAT(C3, va3, vl); + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= vl; // number of values in A +#else + temp -= 4; // number of values in B +#endif + ptrba += temp*vl; + ptrbb += temp*4; +#endif + +#ifdef LEFT + off += vl; // number of values in A +#endif + + C0 += vl; + C1 += vl; + C2 += vl; + C3 += vl; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 4; +#endif + + bb += (bk<<2); + C += (ldc<<2); + } + + if (bn & 2) + { + C0 = C; + C1 = C0+ldc; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + + ptrba = ba; + + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*vl; + ptrbb = bb + off*2; +#endif + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+vl; // number of values in A +#else + temp = off+2; // number of values in B +#endif + + for (k = temp/8; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + va1 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + ptrbb += 2; + va2 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl); + ptrbb += 2; + va3 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl); + ptrbb += 2; + va4 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl); + ptrbb += 2; + va5 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 
0), va4, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl); + ptrbb += 2; + va6 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl); + ptrbb += 2; + va7 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl); + ptrbb += 2; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl); + ptrbb += 2; + } + + // K remainder + for (k = temp&7; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + + ptrbb += 2; + ptrba += vl; + } + va0 = VFMULVF_FLOAT(vres0, alpha, vl); + VSEV_FLOAT(C0, va0, vl); + + va1 = VFMULVF_FLOAT(vres1, alpha, vl); + VSEV_FLOAT(C1, va1, vl); + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= vl; // number of values in A +#else + temp -= 2; // number of values in B +#endif + ptrba += temp*vl; + ptrbb += temp*2; +#endif + +#ifdef LEFT + off += vl; // number of values in A +#endif + + C0 += vl; + C1 += vl; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; +#endif + + bb += (bk<<1); + C += (ldc<<1); + } + + if (bn & 1) + { + C0 = C; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + + ptrba = ba; + + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*vl; + ptrbb = bb + off*1; +#endif + + vres0 = VFMVVF_FLOAT(0.0, vl); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+vl; // number of values in A +#else + temp = off+1; // number of values in B +#endif + + for (k = temp/8; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + va1 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + ptrbb += 1; + va2 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); + ptrbb += 1; + va3 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); + ptrbb += 1; + va4 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); + ptrbb += 1; + va5 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); + ptrbb += 1; + va6 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); + ptrbb += 1; + va7 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); + ptrbb += 1; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); + ptrbb += 1; + } + + // K remainder + for (k = temp&7; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + + ptrbb += 1; + ptrba += vl; + } + va0 = VFMULVF_FLOAT(vres0, alpha, vl); + VSEV_FLOAT(C0, va0, vl); + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= vl; // number of values in A +#else + temp -= 1; // number of values in B +#endif + ptrba += temp*vl; + ptrbb += temp*1; +#endif + +#ifdef LEFT + off += vl; // number of values in A +#endif + + C0 += vl; + } + +#if defined(TRMMKERNEL) && 
!defined(LEFT) + off += 1; +#endif + + bb += (bk); + C += (ldc); + } + return 0; +} + diff --git a/kernel/riscv64/trsm_kernel_LN_rvv_v1.c b/kernel/riscv64/trsm_kernel_LN_rvv_v1.c new file mode 100644 index 000000000..869561fb3 --- /dev/null +++ b/kernel/riscv64/trsm_kernel_LN_rvv_v1.c @@ -0,0 +1,364 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSSEG2_FLOAT __riscv_vssseg2e32_v_f32m2x2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSSEG2_FLOAT __riscv_vssseg2e64_v_f64m2x2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m2 +#endif + + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_L +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +// Optimizes the implementation in ../arm64/trsm_kernel_LN_sve.c + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT aa; + FLOAT* pc; + + int i, j, k; + + BLASLONG stride_ldc = sizeof(FLOAT) * ldc; + + FLOAT_V_T vb, vc; + + size_t vl; + + a += (m - 1) * m; + b += (m - 1) * n; + + for (i = m - 1; i >= 0; i--) { + + aa = *(a + i); + pc = c; + for (j = n; j > 0; j -= vl) { + vl = VSETVL(j); + vb = VLSEV_FLOAT(pc + i, stride_ldc, vl); + vb = VFMULVF_FLOAT(vb, aa, vl); + VSEV_FLOAT(b, vb, vl); + VSSEV_FLOAT(pc + i, stride_ldc, vb, vl); + b += vl; + + for (k = 0; k < i; k ++) { + vc = VLSEV_FLOAT(pc + k, stride_ldc, vl); + vc = VFNMSACVF_FLOAT(vc, *(a + k), vb, vl); + VSSEV_FLOAT(pc + k, stride_ldc, vc, vl); + } + pc += vl * ldc; + } + a -= m; + b -= 2 * n; + } + +} +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT *pc; + int i, j, k; + + BLASLONG stride_ldc = sizeof(FLOAT) * ldc * 2; + + FLOAT_VX2_T vbx2, vsx2, vcx2; + FLOAT_V_T vb1, vb2, vc1, vc2, vs1, vs2; + size_t vl; + a += (m - 1) * m * 2; + b += (m - 1) * n * 2; + + for (i = m - 1; i >= 0; i--) { + + aa1 = *(a + i * 2 + 0); + aa2 = *(a + i * 2 + 1); + pc = c; + + for (j = n; j > 0; j -= vl) { + vl = VSETVL(j); + vbx2 = VLSSEG2_FLOAT(pc + i * 2, stride_ldc, vl); + vb1 = VGET_VX2(vbx2, 0); + vb2 = VGET_VX2(vbx2, 1); +#ifndef CONJ + vs1 = VFMULVF_FLOAT(vb1, aa1, vl); + vs1 = 
VFNMSACVF_FLOAT(vs1, aa2, vb2, vl); + vs2 = VFMULVF_FLOAT(vb2, aa1, vl); + vs2 = VFMACCVF_FLOAT(vs2, aa2, vb1, vl); +#else + vs1 = VFMULVF_FLOAT(vb1, aa1, vl); + vs1 = VFMACCVF_FLOAT(vs1, aa2, vb2, vl); + vs2 = VFMULVF_FLOAT(vb2, aa1, vl); + vs2 = VFNMSACVF_FLOAT(vs2, aa2, vb1, vl); +#endif + vsx2 = VSET_VX2(vsx2, 0, vs1); + vsx2 = VSET_VX2(vsx2, 1, vs2); + VSSEG2_FLOAT(b, vsx2, vl); + VSSSEG2_FLOAT(pc + i * 2, stride_ldc, vsx2, vl); + b += vl * 2; + + for (k = 0; k < i; k ++) { + vcx2 = VLSSEG2_FLOAT(pc + k * 2, stride_ldc, vl); + vc1 = VGET_VX2(vcx2, 0); + vc2 = VGET_VX2(vcx2, 1); +#ifndef CONJ + vc1 = VFMACCVF_FLOAT(vc1, *(a + k * 2 + 1), vs2, vl); + vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 0), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 1), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 0), vs2, vl); +#else + vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 1), vs2, vl); + vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 0), vs1, vl); + vc2 = VFMACCVF_FLOAT(vc2, *(a + k * 2 + 1), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 0), vs2, vl); +#endif + vcx2 = VSET_VX2(vcx2, 0, vc1); + vcx2 = VSET_VX2(vcx2, 1, vc2); + VSSSEG2_FLOAT(pc + k * 2, stride_ldc, vcx2, vl); + } + pc += vl * ldc * 2; + } + a -= m * 2; + b -= 4 * n; + } +} + + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + BLASLONG i, j; + FLOAT *aa, *cc; + BLASLONG kk; + + size_t vl = VSETVL_MAX; + + //fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug + + j = (n >> GEMM_UNROLL_N_SHIFT); + + while (j > 0) { + + kk = m + offset; + + i = m % vl; + if (i) { + aa = a + (m - i) * k * COMPSIZE; + cc = c + (m - i) * COMPSIZE; + + if (k - kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(i, GEMM_UNROLL_N, + aa + (kk - i) * i * COMPSIZE, + b + (kk - i) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + kk -= i; + + } + + int mod = i; + i = vl; + if (i <= m) { + aa = a + (m - mod - vl) * k * COMPSIZE; + cc = c + (m - mod - vl) * COMPSIZE; + + do { + if (k - kk > 0) { + GEMM_KERNEL(vl, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + vl * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(vl, GEMM_UNROLL_N, + aa + (kk - vl) * vl * COMPSIZE, + b + (kk - vl) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa -= vl * k * COMPSIZE; + cc -= vl * COMPSIZE; + kk -= vl; + + i += vl; + } while (i <= m); + } + + + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + kk = m + offset; + + i = m % vl; + if (i) { + aa = a + (m - i) * k * COMPSIZE; + cc = c + (m - i) * COMPSIZE; + + if (k - kk > 0) { + GEMM_KERNEL(i, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, ldc); + } + + solve(i, j, + aa + (kk - i) * i * COMPSIZE, + b + (kk - i) * j * COMPSIZE, + cc, ldc); + + kk -= i; + + } + + int mod = i; + i = vl; + if (i <= m) { + aa = a + (m - mod - vl) * k * COMPSIZE; + cc = c + (m - mod - vl) * COMPSIZE; + + do { + if (k - kk > 0) { + GEMM_KERNEL(vl, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + vl * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, + ldc); + } + + solve(vl, j, + aa + (kk - vl) * vl 
* COMPSIZE, + b + (kk - vl) * j * COMPSIZE, + cc, ldc); + + aa -= vl * k * COMPSIZE; + cc -= vl * COMPSIZE; + kk -= vl; + + i += vl; + } while (i <= m); + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/riscv64/trsm_kernel_LT_rvv_v1.c b/kernel/riscv64/trsm_kernel_LT_rvv_v1.c new file mode 100644 index 000000000..da443cfba --- /dev/null +++ b/kernel/riscv64/trsm_kernel_LT_rvv_v1.c @@ -0,0 +1,341 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSSEG2_FLOAT __riscv_vssseg2e32_v_f32m2x2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSSEG2_FLOAT __riscv_vssseg2e64_v_f64m2x2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m2 +#endif + + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_L +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +// Optimizes the implementation in ../arm64/trsm_kernel_LT_sve.c + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa; + FLOAT* pc; + + int i, j, k; + + BLASLONG stride_ldc = sizeof(FLOAT) * ldc; + + FLOAT_V_T vb, vc; + + size_t vl; + + for (i = 0; i < m; i++) { + + aa = *(a + i); + pc = c; + for (j = n; j > 0; j -= vl) { + vl = VSETVL(j); + vb = VLSEV_FLOAT(pc + i, stride_ldc, vl); + vb = VFMULVF_FLOAT(vb, aa, vl); + VSEV_FLOAT(b, vb, vl); + VSSEV_FLOAT(pc + i, stride_ldc, vb, vl); + b += vl; + + for (k = i + 1; k < m; k++) { + vc = VLSEV_FLOAT(pc + k, stride_ldc, vl); + vc = VFNMSACVF_FLOAT(vc, *(a + k), vb, vl); + VSSEV_FLOAT(pc + k, stride_ldc, vc, vl); + } + pc += vl * ldc; + } + a += m; + } +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT *pc; + int i, j, k; + + BLASLONG stride_ldc = sizeof(FLOAT) * ldc * 2; + + FLOAT_VX2_T vbx2, vsx2, vcx2; + FLOAT_V_T vb1, vb2, vc1, vc2, vs1, vs2; + size_t vl; + + ldc *= 2; + + for (i = 0; i < m; i++) { + aa1 = *(a + i * 2 + 0); + aa2 = *(a + i * 2 + 1); + pc = c; + + for (j = n; j > 0; j -= vl) { + vl = VSETVL(j); + vbx2 = VLSSEG2_FLOAT(pc + i * 2, stride_ldc, vl); + vb1 = VGET_VX2(vbx2, 0); + vb2 = VGET_VX2(vbx2, 1); +#ifndef CONJ + vs1 = VFMULVF_FLOAT(vb1, aa1, vl); + vs1 = VFNMSACVF_FLOAT(vs1, aa2, vb2, vl); + vs2 = VFMULVF_FLOAT(vb2, aa1, vl); + vs2 = VFMACCVF_FLOAT(vs2, aa2, 
vb1, vl); +#else + vs1 = VFMULVF_FLOAT(vb1, aa1, vl); + vs1 = VFMACCVF_FLOAT(vs1, aa2, vb2, vl); + vs2 = VFMULVF_FLOAT(vb2, aa1, vl); + vs2 = VFNMSACVF_FLOAT(vs2, aa2, vb1, vl); +#endif + vsx2 = VSET_VX2(vsx2, 0, vs1); + vsx2 = VSET_VX2(vsx2, 1, vs2); + VSSEG2_FLOAT(b, vsx2, vl); + VSSSEG2_FLOAT(pc + i * 2, stride_ldc, vsx2, vl); + b += vl * 2; + + for (k = i + 1; k < m; k++) { + vcx2 = VLSSEG2_FLOAT(pc + k * 2, stride_ldc, vl); + vc1 = VGET_VX2(vcx2, 0); + vc2 = VGET_VX2(vcx2, 1); +#ifndef CONJ + vc1 = VFMACCVF_FLOAT(vc1, *(a + k * 2 + 1), vs2, vl); + vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 0), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 1), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 0), vs2, vl); +#else + vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 1), vs2, vl); + vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 0), vs1, vl); + vc2 = VFMACCVF_FLOAT(vc2, *(a + k * 2 + 1), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 0), vs2, vl); +#endif + vcx2 = VSET_VX2(vcx2, 0, vc1); + vcx2 = VSET_VX2(vcx2, 1, vc2); + VSSSEG2_FLOAT(pc + k * 2, stride_ldc, vcx2, vl); + } + pc += vl * ldc * 2; + } + + a += m * 2; + } +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + FLOAT *aa, *cc; + BLASLONG kk; + BLASLONG i, j; + + size_t vl = VSETVL_MAX; + + //fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug + + j = (n >> GEMM_UNROLL_N_SHIFT); + + while (j > 0) { + + kk = offset; + aa = a; + cc = c; + + i = vl; + + while (i <= m) { + + if (kk > 0) { + GEMM_KERNEL(vl, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + + solve(vl, GEMM_UNROLL_N, + aa + kk * vl * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += vl * k * COMPSIZE; + cc += vl * COMPSIZE; + kk += vl; + i += vl; + } + + i = m % vl; + if (i) { + if (kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + solve(i, GEMM_UNROLL_N, + aa + kk * i * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + kk += i; + + } + + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + kk = offset; + aa = a; + cc = c; + + i = vl; + + while (i <= m) { + if (kk > 0) { + GEMM_KERNEL(vl, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(vl, j, + aa + kk * vl * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += vl * k * COMPSIZE; + cc += vl * COMPSIZE; + kk += vl; + i += vl; + } + + i = m % vl; + if (i) { + if (kk > 0) { + GEMM_KERNEL(i, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(i, j, + aa + kk * i * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + kk += i; + + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/riscv64/trsm_kernel_RN_rvv_v1.c b/kernel/riscv64/trsm_kernel_RN_rvv_v1.c new file mode 100644 index 000000000..32e481036 --- /dev/null +++ b/kernel/riscv64/trsm_kernel_RN_rvv_v1.c @@ -0,0 +1,337 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m2 +#endif + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_R +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +// Optimizes the implementation in ../arm64/trsm_kernel_RN_sve.c + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) 
{ + + FLOAT bb; + FLOAT *pci, *pcj; + + int i, j, k; + FLOAT_V_T va, vc; + + size_t vl; + for (i = 0; i < n; i++) { + + bb = *(b + i); + pci = c + i * ldc; + pcj = c; + for (j = m; j > 0; j -= vl) { + vl = VSETVL(j); + va = VLEV_FLOAT(pci, vl); + va = VFMULVF_FLOAT(va, bb, vl); + VSEV_FLOAT(a, va, vl); + VSEV_FLOAT(pci, va, vl); + a += vl; + pci += vl; + for (k = i + 1; k < n; k ++){ + vc = VLEV_FLOAT(pcj + k * ldc, vl); + vc = VFNMSACVF_FLOAT(vc, *(b + k), va, vl); + VSEV_FLOAT(pcj + k * ldc, vc, vl); + } + pcj += vl; + } + b += n; + } +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT bb1, bb2; + + FLOAT *pci, *pcj; + + int i, j, k; + + FLOAT_VX2_T vax2, vsx2, vcx2; + FLOAT_V_T va1, va2, vs1, vs2, vc1, vc2; + + size_t vl; + + for (i = 0; i < n; i++) { + + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); + + pci = c + i * ldc * 2; + pcj = c; + + for (j = m; j > 0; j -= vl) { + vl = VSETVL(j); + vax2 = VLSEG2_FLOAT(pci, vl); + va1 = VGET_VX2(vax2, 0); + va2 = VGET_VX2(vax2, 1); +#ifndef CONJ + vs1 = VFMULVF_FLOAT(va1, bb1, vl); + vs1 = VFNMSACVF_FLOAT(vs1, bb2, va2, vl); + vs2 = VFMULVF_FLOAT(va1, bb2, vl); + vs2 = VFMACCVF_FLOAT(vs2, bb1, va2, vl); +#else + vs1 = VFMULVF_FLOAT(va1, bb1, vl); + vs1 = VFMACCVF_FLOAT(vs1, bb2, va2, vl); + vs2 = VFMULVF_FLOAT(va2, bb1, vl); + vs2 = VFNMSACVF_FLOAT(vs2, bb2, va1, vl); +#endif + vsx2 = VSET_VX2(vsx2, 0, vs1); + vsx2 = VSET_VX2(vsx2, 1, vs2); + VSSEG2_FLOAT(a, vsx2, vl); + VSSEG2_FLOAT(pci, vsx2, vl); + a += vl * 2; + pci += vl * 2; + + for (k = i + 1; k < n; k ++){ + vcx2 = VLSEG2_FLOAT(pcj + k * ldc * 2, vl); + vc1 = VGET_VX2(vcx2, 0); + vc2 = VGET_VX2(vcx2, 1); +#ifndef CONJ + vc1 = VFMACCVF_FLOAT(vc1, *(b + k * 2 + 1), vs2, vl); + vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 0), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 1), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 0), vs2, vl); +#else + vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 0), vs1, vl); + vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 1), vs2, vl); + vc2 = VFMACCVF_FLOAT(vc2, *(b + k * 2 + 1), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 0), vs2, vl); +#endif + vcx2 = VSET_VX2(vcx2, 0, vc1); + vcx2 = VSET_VX2(vcx2, 1, vc2); + VSSEG2_FLOAT(pcj + k * ldc * 2, vcx2, vl); + } + pcj += vl * 2; + } + b += n * 2; + } +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + FLOAT *aa, *cc; + BLASLONG kk; + BLASLONG i, j; + + size_t vl = VSETVL_MAX; + + //fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug + + + j = (n >> GEMM_UNROLL_N_SHIFT); + kk = -offset; + + while (j > 0) { + + aa = a; + cc = c; + + i = vl; + + if (i <= m) { + do { + if (kk > 0) { + GEMM_KERNEL(vl, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + + solve(vl, GEMM_UNROLL_N, + aa + kk * vl * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += vl * k * COMPSIZE; + cc += vl * COMPSIZE; + i += vl; + } while (i <= m); + } + + + i = m % vl; + if (i) { + if (kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + solve(i, GEMM_UNROLL_N, + aa + kk * i * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + + kk += GEMM_UNROLL_N; + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * 
ldc * COMPSIZE; + j --; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + aa = a; + cc = c; + + i = vl; + + while (i <= m) { + if (kk > 0) { + GEMM_KERNEL(vl, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(vl, j, + aa + kk * vl * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += vl * k * COMPSIZE; + cc += vl * COMPSIZE; + i += vl; + } + + i = m % vl; + if (i) { + if (kk > 0) { + GEMM_KERNEL(i, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(i, j, + aa + kk * i * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + kk += j; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/riscv64/trsm_kernel_RT_rvv_v1.c b/kernel/riscv64/trsm_kernel_RT_rvv_v1.c new file mode 100644 index 000000000..81cc41818 --- /dev/null +++ b/kernel/riscv64/trsm_kernel_RT_rvv_v1.c @@ -0,0 +1,356 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m2 +#endif + + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_R +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +// Optimizes the implementation in ../arm64/trsm_kernel_RT_sve.c + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT bb; + FLOAT *pci, *pcj; + + int i, j, k; + FLOAT_V_T va, vc; + + size_t vl; + + a += (n - 1) * m; + b += (n - 1) * n; + + for (i = n - 1; i >= 0; i--) { + + bb = *(b + i); + pci = c + i * ldc; + pcj = c; + for (j = m; j > 0; j -= vl) { + vl = VSETVL(j); + va = VLEV_FLOAT(pci, vl); + va = VFMULVF_FLOAT(va, bb, vl); + VSEV_FLOAT(a, va, vl); + VSEV_FLOAT(pci, va, vl); + a += vl; + pci += vl; + for (k = 0; k < i; k ++){ + vc = VLEV_FLOAT(pcj + k * ldc, vl); + vc = VFNMSACVF_FLOAT(vc, *(b + k), va, vl); + VSEV_FLOAT(pcj + k * ldc, vc, vl); + } + pcj += vl; + } + b -= n; + a -= 2 * m; + } +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT bb1, bb2; + + FLOAT *pci, *pcj; + + int i, j, k; + + FLOAT_VX2_T vax2, vsx2, vcx2; + FLOAT_V_T va1, va2, vs1, vs2, vc1, vc2; + + size_t vl; + + a += (n - 1) * m * 2; + b += (n - 1) * n * 2; + + for (i = n - 1; i >= 0; i--) { + + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); + + pci = c + i * ldc * 2; + pcj = c; + for (j = m; j > 0; j -= vl) { + vl = VSETVL(j); + vax2 = VLSEG2_FLOAT(pci, vl); + va1 = VGET_VX2(vax2, 0); + va2 = VGET_VX2(vax2, 1); +#ifndef CONJ + vs1 = VFMULVF_FLOAT(va1, bb1, vl); + vs1 = VFNMSACVF_FLOAT(vs1, bb2, va2, vl); + vs2 = VFMULVF_FLOAT(va1, bb2, vl); + vs2 = VFMACCVF_FLOAT(vs2, bb1, va2, vl); +#else + vs1 = VFMULVF_FLOAT(va1, bb1, vl); + vs1 = VFMACCVF_FLOAT(vs1, bb2, va2, vl); + vs2 = VFMULVF_FLOAT(va2, bb1, vl); + vs2 = VFNMSACVF_FLOAT(vs2, bb2, va1, vl); +#endif + 
vsx2 = VSET_VX2(vsx2, 0, vs1); + vsx2 = VSET_VX2(vsx2, 1, vs2); + VSSEG2_FLOAT(a, vsx2, vl); + VSSEG2_FLOAT(pci, vsx2, vl); + a += vl * 2; + pci += vl * 2; + + for (k = 0; k < i; k ++){ + vcx2 = VLSEG2_FLOAT(pcj + k * ldc * 2, vl); + vc1 = VGET_VX2(vcx2, 0); + vc2 = VGET_VX2(vcx2, 1); +#ifndef CONJ + vc1 = VFMACCVF_FLOAT(vc1, *(b + k * 2 + 1), vs2, vl); + vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 0), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 1), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 0), vs2, vl); +#else + vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 0), vs1, vl); + vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 1), vs2, vl); + vc2 = VFMACCVF_FLOAT(vc2, *(b + k * 2 + 1), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 0), vs2, vl); +#endif + vcx2 = VSET_VX2(vcx2, 0, vc1); + vcx2 = VSET_VX2(vcx2, 1, vc2); + VSSEG2_FLOAT(pcj + k * ldc * 2, vcx2, vl); + } + pcj += vl * 2; + } + b -= n * 2; + a -= 4 * m; + } +} + +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + BLASLONG i, j; + FLOAT *aa, *cc; + BLASLONG kk; + + size_t vl = VSETVL_MAX; + + //fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug + + kk = n - offset; + c += n * ldc * COMPSIZE; + b += n * k * COMPSIZE; + + if (n & (GEMM_UNROLL_N - 1)) { + + j = 1; + while (j < GEMM_UNROLL_N) { + if (n & j) { + + aa = a; + b -= j * k * COMPSIZE; + c -= j * ldc* COMPSIZE; + cc = c; + + i = vl; + if (i <= m) { + + do { + if (k - kk > 0) { + GEMM_KERNEL(vl, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + vl * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, + ldc); + } + + solve(vl, j, + aa + (kk - j) * vl * COMPSIZE, + b + (kk - j) * j * COMPSIZE, + cc, ldc); + + aa += vl * k * COMPSIZE; + cc += vl * COMPSIZE; + i += vl; + } while (i <= m); + } + + i = m % vl; + if (i) { + if (k - kk > 0) { + GEMM_KERNEL(i, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, ldc); + } + + solve(i, j, + aa + (kk - j) * i * COMPSIZE, + b + (kk - j) * j * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + kk -= j; + } + j <<= 1; + } + } + + j = (n >> GEMM_UNROLL_N_SHIFT); + + if (j > 0) { + + do { + aa = a; + b -= GEMM_UNROLL_N * k * COMPSIZE; + c -= GEMM_UNROLL_N * ldc * COMPSIZE; + cc = c; + + i = vl; + if (i <= m) { + do { + if (k - kk > 0) { + GEMM_KERNEL(vl, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + vl * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(vl, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * vl * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += vl * k * COMPSIZE; + cc += vl * COMPSIZE; + i += vl; + } while (i <= m); + } + + i = m % vl; + if (i) { + if (k - kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(i, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * i * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + + kk -= GEMM_UNROLL_N; + j --; + } while (j > 0); + } + + return 0; +} + + diff --git a/kernel/riscv64/trsm_lncopy_rvv_v1.c b/kernel/riscv64/trsm_lncopy_rvv_v1.c new file mode 100644 index 000000000..41c84be25 --- /dev/null +++ 
b/kernel/riscv64/trsm_lncopy_rvv_v1.c @@ -0,0 +1,122 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VSEV_FLOAT_M __riscv_vse32_v_f32m2_m +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u32m2_b16 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VSEV_FLOAT_M __riscv_vse64_v_f64m2_m +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u64m2_b32 + +#endif + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +// Optimizes the implementation in ../arm64/trsm_lncopy_sve.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, jj, js; + + FLOAT *ao; + + jj = offset; + + BLASLONG stride_lda = sizeof(FLOAT)*lda; + + FLOAT_V_T va1; + VBOOL_T vbool_cmp; + UINT_V_T vindex; + size_t vl; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + ao = a; + + ii = 0; + for (i = 0; i < m;) + { + if (ii == jj) + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + va1 = VLSEV_FLOAT(ao, stride_lda, vl); + vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); + VSEV_FLOAT_M(vbool_cmp, b, va1, vl); + + *(b + j) = INV(*(ao + j * lda)); + ao++; + b += vl; + } + i += vl; + ii += vl; + } + else + { + if (ii > jj) + { + va1 = VLSEV_FLOAT(ao, stride_lda, vl); + VSEV_FLOAT(b, va1, vl); + } + ao++; + b += vl; + i++; + ii++; + } 
+ } + + a += vl * lda; + jj += vl; + } + + return 0; +} diff --git a/kernel/riscv64/trsm_ltcopy_rvv_v1.c b/kernel/riscv64/trsm_ltcopy_rvv_v1.c new file mode 100644 index 000000000..003bd3465 --- /dev/null +++ b/kernel/riscv64/trsm_ltcopy_rvv_v1.c @@ -0,0 +1,122 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VSEV_FLOAT_M __riscv_vse32_v_f32m2_m +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u32m2_b16 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VSEV_FLOAT_M __riscv_vse64_v_f64m2_m +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u64m2_b32 +#endif + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +// Optimizes the implementation in ../arm64/trsm_ltcopy_sve.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, jj, js; + + FLOAT *ao; + + jj = offset; + + FLOAT_V_T va1; + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + size_t vl; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + ao = a; + + ii = 0; + for (i = 0; i < m;) + { + + if (ii == jj) + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + *(b + j) = INV(*(ao + j)); + + va1 = VLEV_FLOAT(ao, vl); + vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); + VSEV_FLOAT_M(vbool_cmp, b, va1, vl); + + b += vl; + ao += lda; + } + i += vl; + ii += vl; + } + else + { + if (ii < jj) + { + va1 = VLEV_FLOAT(ao, vl); + VSEV_FLOAT(b, va1, vl); + } + ao += lda; + b += vl; + i ++; + ii ++; + } + } + + a += vl; + jj += vl; + } + return 0; +} + diff --git a/kernel/riscv64/trsm_uncopy_rvv_v1.c b/kernel/riscv64/trsm_uncopy_rvv_v1.c new file mode 100644 index 000000000..6cca5d49c --- /dev/null +++ b/kernel/riscv64/trsm_uncopy_rvv_v1.c @@ -0,0 +1,121 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VSEV_FLOAT_M __riscv_vse32_v_f32m2_m +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u32m2_b16 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VSEV_FLOAT_M __riscv_vse64_v_f64m2_m +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u64m2_b32 +#endif + + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +// Optimizes the implementation in ../arm64/trsm_uncopy_sve.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, jj, js; + BLASLONG stride_lda = sizeof(FLOAT)*lda; + + FLOAT *ao; + jj = offset; + + FLOAT_V_T va1; + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + size_t vl; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + ao = a; + + i = 0; + ii = 0; + for (i = 0; i < m;) + { + if (ii == jj) + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + *(b + j) = INV(*(ao + j * lda)); + va1 = VLSEV_FLOAT(ao, stride_lda, vl); + vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); + VSEV_FLOAT_M(vbool_cmp, b, va1, vl); + ao++; + b += vl; + } + i += vl; + ii += vl; + } + else + { + if (ii < jj) + { + va1 = VLSEV_FLOAT(ao, stride_lda, vl); + VSEV_FLOAT(b, va1, vl); + } + ao++; + b += vl; + i++; + ii++; + } + } + + a += vl * lda; + jj += vl; + } + return 0; +} diff --git a/kernel/riscv64/trsm_utcopy_rvv_v1.c b/kernel/riscv64/trsm_utcopy_rvv_v1.c new file mode 100644 index 000000000..bc058525f --- /dev/null +++ b/kernel/riscv64/trsm_utcopy_rvv_v1.c @@ -0,0 +1,123 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VSEV_FLOAT_M __riscv_vse32_v_f32m2_m +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u32m2_b16 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VSEV_FLOAT_M __riscv_vse64_v_f64m2_m +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u64m2_b32 +#endif + + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +// Optimizes the implementation in ../arm64/trsm_utcopy_sve.c + + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, jj, js; + + FLOAT *ao; + + jj = offset; + FLOAT_V_T va1; + + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + size_t vl; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + ao = a; + + ii = 0; + for (i = 0; i < m;) + { + + if (ii == jj) + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + va1 = VLEV_FLOAT(ao, vl); + vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); + VSEV_FLOAT_M(vbool_cmp, b, va1, vl); + *(b + j) = INV(*(ao + j)); + + ao += lda; + b += vl; + } + i += vl; + ii += vl; + } + else + { + if (ii > jj) + { + va1 = VLEV_FLOAT(ao, vl); + VSEV_FLOAT(b, va1, vl); + } + ao += lda; + b += vl; + i ++; + ii ++; + } + } + + a += vl; + jj += vl; + } + + return 0; +} diff --git a/kernel/riscv64/zamax_rvv.c b/kernel/riscv64/zamax_rvv.c new file mode 100644 index 000000000..180cf059a --- /dev/null +++ b/kernel/riscv64/zamax_rvv.c @@ -0,0 +1,124 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m4() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m4_f32m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f32m4_tu +#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4 +#define VFABSV_FLOAT __riscv_vfabs_v_f32m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m4() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m4_f64m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f64m4_tu +#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4 +#define VFABSV_FLOAT __riscv_vfabs_v_f64m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT maxf=0.0; + + if (n <= 0 || inc_x <= 0) return(maxf); + + FLOAT_V_T v0, v1, vmax; + FLOAT_V_T_M1 v_res; + FLOAT_VX2_T vx2; + + v_res = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); + size_t vlmax = VSETVL_MAX; + vmax = VFMVVF_FLOAT(0.0, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2) { + vl = VSETVL(n); + + vx2 = VLSEG_FLOAT(x, vl); + + v0 = VGET_VX2(vx2, 0); + v1 = VGET_VX2(vx2, 1); + + v0 = VFABSV_FLOAT(v0, vl); + v1 = VFABSV_FLOAT(v1, vl); + + v0 = VFADDVV_FLOAT(v0, v1, vl); + vmax = VFMAXVV_FLOAT_TU(vmax, vmax, v0, vl); + + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { + vl = VSETVL(n); + + vx2 = VLSSEG_FLOAT(x, stride_x, vl); + + v0 = VGET_VX2(vx2, 0); + v1 = VGET_VX2(vx2, 1); + + v0 = VFABSV_FLOAT(v0, vl); + v1 = VFABSV_FLOAT(v1, vl); + + v0 = VFADDVV_FLOAT(v0, v1, vl); + vmax = VFMAXVV_FLOAT_TU(vmax, vmax, 
v0, vl); + } + + } + + v_res = VFREDMAXVS_FLOAT(vmax, v_res, vlmax); + maxf = VFMVFS_FLOAT_M1(v_res); + + return(maxf); +} diff --git a/kernel/riscv64/zamax_vector.c b/kernel/riscv64/zamax_vector.c index bfb282ae0..ec4a5a1e9 100644 --- a/kernel/riscv64/zamax_vector.c +++ b/kernel/riscv64/zamax_vector.c @@ -28,39 +28,47 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 -#define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m -#define VFMAXVV_FLOAT vfmax_vv_f32m8 -#define VFADDVV_FLOAT vfadd_vv_f32m8 - +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 32 +# else +# define ELEN 32 +# define MLEN 16 +# endif #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 -#define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m -#define VFMAXVV_FLOAT vfmax_vv_f64m8 -#define VFADDVV_FLOAT vfadd_vv_f64m8 +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 8 +# else +# define ELEN 32 +# define MLEN 4 +# endif +#endif + +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMAXVS_FLOAT(va,vb,gvl) JOIN(RISCV_RVV(vfredmax_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) (v_res, va, vb, gvl) +#else +#define VFREDMAXVS_FLOAT JOIN(RISCV_RVV(vfredmax_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) #endif +#define MASK_T JOIN(vbool, MLEN, _t, _, _) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) +#define VFMAXVV_FLOAT JOIN(RISCV_RVV(vfmax), _vv_f, ELEN, LMUL, _) +#define VFADDVV_FLOAT JOIN(RISCV_RVV(vfadd), _vv_f, ELEN, LMUL, _) +#define VFABSV_FLOAT JOIN(RISCV_RVV(vfabs), _v_f, ELEN, LMUL, _) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { @@ -70,10 +78,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n <= 0 || inc_x <= 0) return(maxf); unsigned int gvl = 0; FLOAT_V_T v0, v1, v_max; - FLOAT_V_T_M1 v_res, v_z0; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_z0 = VFMVVF_FLOAT_M1(0, gvl); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(0, 1); MASK_T mask0, mask1; BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; @@ -83,10 +89,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) for(; i maxf) - maxf = VFMVFS_FLOAT(v_res); + v_res = VFREDMAXVS_FLOAT(v1, v_res, gvl); } + maxf = EXTRACT_FLOAT(v_res); return(maxf); } 
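The zamax_vector.c hunk above replaces the hard-coded m8 intrinsic names with JOIN token-pasting macros, so the same source file can build either the generic m8 configuration or the m2 configuration selected by RISCV64_ZVL256B. The following is a minimal, host-compilable sketch of how those JOIN macros assemble intrinsic and type names; it is not part of the patch, and the pass-through definition of RISCV_RVV() and the fixed ELEN/LMUL values are assumptions made only for illustration (the real definitions come from the OpenBLAS common headers and the DOUBLE/ZVL256B switches).

#include <stdio.h>

/* Assumption: RISCV_RVV() adds the v1.0 intrinsic prefix; the real definition
   lives in the OpenBLAS common headers and may differ (e.g. for 0.10 intrinsics). */
#define RISCV_RVV(x) __riscv_##x

/* Fixed here for illustration; zamax_vector.c derives these from
   RISCV64_ZVL256B and DOUBLE instead. */
#define LMUL m8
#define ELEN 32

/* Token-pasting helpers as introduced in the hunk above. */
#define _
#define JOIN2_X(x, y) x ## y
#define JOIN2(x, y) JOIN2_X(x, y)
#define JOIN(v, w, x, y, z) JOIN2(JOIN2(JOIN2(JOIN2(v, w), x), y), z)

/* Stringize the final token so the result can be printed on any host,
   without needing a RISC-V toolchain. */
#define STR_X(x) #x
#define STR(x) STR_X(x)

int main(void)
{
    puts(STR(JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _)));   /* __riscv_vsetvl_e32m8 */
    puts(STR(JOIN(vfloat, ELEN, LMUL, _t, _)));              /* vfloat32m8_t */
    puts(STR(JOIN(RISCV_RVV(vfmax), _vv_f, ELEN, LMUL, _))); /* __riscv_vfmax_vv_f32m8 */
    return 0;
}

Compiled with any C compiler, this prints the assembled names; with LMUL defined as m2 instead (the RISCV64_ZVL256B branch in the hunk) the same machinery would yield __riscv_vsetvl_e32m2 and friends, which is how the refactored zamax/zamin kernels pick their vector width at build time.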
diff --git a/kernel/riscv64/zamin_rvv.c b/kernel/riscv64/zamin_rvv.c new file mode 100644 index 000000000..56a467502 --- /dev/null +++ b/kernel/riscv64/zamin_rvv.c @@ -0,0 +1,123 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m4() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m4_f32m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f32m4_tu +#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4 +#define VFABSV_FLOAT __riscv_vfabs_v_f32m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m4() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m4_f64m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f64m4_tu +#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4 +#define VFABSV_FLOAT __riscv_vfabs_v_f64m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT minf=0.0; + + if (n <= 0 || inc_x <= 0) return(minf); + + FLOAT_V_T v0, v1, vmin; + FLOAT_V_T_M1 
v_res; + FLOAT_VX2_T vx2; + + v_res = VFMVVF_FLOAT_M1(FLT_MAX, VSETVL_MAX_M1); + size_t vlmax = VSETVL_MAX; + vmin = VFMVVF_FLOAT(FLT_MAX, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2) { + vl = VSETVL(n); + + vx2 = VLSEG_FLOAT(x, vl); + + v0 = VGET_VX2(vx2, 0); + v1 = VGET_VX2(vx2, 1); + + v0 = VFABSV_FLOAT(v0, vl); + v1 = VFABSV_FLOAT(v1, vl); + + v0 = VFADDVV_FLOAT(v0, v1, vl); + vmin = VFMINVV_FLOAT_TU(vmin, vmin, v0, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { + vl = VSETVL(n); + + vx2 = VLSSEG_FLOAT(x, stride_x, vl); + + v0 = VGET_VX2(vx2, 0); + v1 = VGET_VX2(vx2, 1); + + v0 = VFABSV_FLOAT(v0, vl); + v1 = VFABSV_FLOAT(v1, vl); + + v0 = VFADDVV_FLOAT(v0, v1, vl); + vmin = VFMINVV_FLOAT_TU(vmin, vmin, v0, vl); + } + + } + + v_res = VFREDMINVS_FLOAT(vmin, v_res, vlmax); + minf = VFMVFS_FLOAT_M1(v_res); + + return(minf); +} diff --git a/kernel/riscv64/zamin_vector.c b/kernel/riscv64/zamin_vector.c index d9eca7f10..45b3e0b9d 100644 --- a/kernel/riscv64/zamin_vector.c +++ b/kernel/riscv64/zamin_vector.c @@ -29,38 +29,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 -#define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m -#define VFMINVV_FLOAT vfmin_vv_f32m8 -#define VFADDVV_FLOAT vfadd_vv_f32m8 + +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 32 +# else +# define ELEN 32 +# define MLEN 16 +# endif #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 -#define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m -#define VFMINVV_FLOAT vfmin_vv_f64m8 -#define VFADDVV_FLOAT vfadd_vv_f64m8 +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 8 +# else +# define ELEN 32 +# define MLEN 4 +# endif #endif +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMINVS_FLOAT(va,vb,gvl) JOIN(RISCV_RVV(vfredmin_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) (v_res, va, vb, gvl) +#else +#define VFREDMINVS_FLOAT JOIN(RISCV_RVV(vfredmin_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#endif +#define MASK_T JOIN(vbool, MLEN, _t, _, _) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) +#define VFMINVV_FLOAT 
JOIN(RISCV_RVV(vfmin), _vv_f, ELEN, LMUL, _) +#define VFADDVV_FLOAT JOIN(RISCV_RVV(vfadd), _vv_f, ELEN, LMUL, _) +#define VFABSV_FLOAT JOIN(RISCV_RVV(vfabs), _v_f, ELEN, LMUL, _) + FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0, j=0; @@ -69,10 +80,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT minf=FLT_MAX; unsigned int gvl = 0; FLOAT_V_T v0, v1, v_min; - FLOAT_V_T_M1 v_res, v_max; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1); MASK_T mask0, mask1; BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; @@ -82,10 +91,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) for(; i 0; n -= vl, x += vl*2) { + vl = VSETVL(n); + + v0 = VLEV_FLOAT(x, vl); + v1 = VLEV_FLOAT(x+vl, vl); + + v0 = VFABSV_FLOAT(v0, vl); + v1 = VFABSV_FLOAT(v1, vl); + + v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v0, vl); + v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v1, vl); + } + + } + else { + + int stride_x = inc_x * sizeof(FLOAT) * 2; + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { + vl = VSETVL(n); + + v0 = VLSEV_FLOAT(x, stride_x, vl); + v1 = VLSEV_FLOAT(x+1, stride_x, vl); + + v0 = VFABSV_FLOAT(v0, vl); + v1 = VFABSV_FLOAT(v1, vl); + + v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v0, vl); + v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v1, vl); + } + + } + + FLOAT_V_T_M1 v_res = VFMVVF_FLOAT_M1(0, vlmax); + v_res = VFREDSUMVS_FLOAT(v_sum, v_res, vlmax); + asumf += VFMVFS_FLOAT_M1(v_res); + + return(asumf); +} diff --git a/kernel/riscv64/zasum_vector.c b/kernel/riscv64/zasum_vector.c index 0d1cc42f1..fca904d6a 100644 --- a/kernel/riscv64/zasum_vector.c +++ b/kernel/riscv64/zasum_vector.c @@ -28,37 +28,47 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #include -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VFFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDSUMVS_FLOAT vfredusum_vs_f32m8_f32m1 -#define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m -#define VFADDVV_FLOAT vfadd_vv_f32m8 +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN _b32 +# else +# define ELEN 32 +# define MLEN _b16 +# endif #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VFFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1 -#define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m -#define VFADDVV_FLOAT vfadd_vv_f64m8 +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN _b8 +# else +# define ELEN 32 +# define MLEN _b4 +# endif #endif + +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUMVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredusum_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))(v_res, va, vb, gvl) +#else +#define VFREDSUMVS_FLOAT JOIN(RISCV_RVV(vfredusum_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#endif +#define VFABS_FLOAT JOIN(RISCV_RVV(vfabs), _v_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) +#define VFADDVV_FLOAT JOIN(RISCV_RVV(vfadd), _vv_f, ELEN, LMUL, _) +#define VMFLTVF_FLOAT JOIN(RISCV_RVV(vmflt), _vf_f, ELEN, LMUL, MLEN) + FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0, j=0; @@ -67,12 +77,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n <= 0 || inc_x <= 0) return(asumf); unsigned int gvl = 0; FLOAT_V_T v0, v1, v_zero,v_sum; - FLOAT_V_T_M1 v_res, v_z0; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_z0 = VFMVVF_FLOAT_M1(0, gvl); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(0, 1); - MASK_T mask0, mask1; if(inc_x == 1){ BLASLONG n2 = n * 2; gvl = VSETVL(n2); @@ -81,26 +88,21 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v_sum = VFMVVF_FLOAT(0, gvl); for(i=0,j=0; i 0; n -= vl, y += vl*inc_y2) + { + vl = VSETVL(n); + VSSSEG_FLOAT(y, stride_y, vxx2, vl); + } + } + else + { + for (size_t vl; n > 0; n -= vl, x += vl*inc_x2, y += vl*inc_y2) + { + vl = VSETVL(n); + + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + + vy0 = VFMULVF_FLOAT(vx1, alpha_i, vl); + vy0 = VFMSACVF_FLOAT(vy0, alpha_r, vx0, vl); + + vy1 = VFMULVF_FLOAT(vx1, alpha_r, vl); + vy1 = VFMACCVF_FLOAT(vy1, alpha_i, vx0, vl); + + vyx2 = 
VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + VSSSEG_FLOAT(y, stride_y, vyx2, vl); + } + } + } + else + { + FLOAT_V_T v0, v1; + FLOAT_VX2_T v_x2; + + if ( alpha_r == 0.0 && alpha_i == 0.0 ) + { + for (size_t vl; n > 0; n -= vl, y += vl*inc_y2) + { + vl = VSETVL(n); + + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); + + v0 = VFMULVF_FLOAT(vy1, beta_i, vl); + v0 = VFMSACVF_FLOAT(v0, beta_r, vy0, vl); + + v1 = VFMULVF_FLOAT(vy1, beta_r, vl); + v1 = VFMACCVF_FLOAT(v1, beta_i, vy0, vl); + + v_x2 = VSET_VX2(v_x2, 0, v0); + v_x2 = VSET_VX2(v_x2, 1, v1); + VSSSEG_FLOAT(y, stride_y, v_x2, vl); + } + } + else + { + for (size_t vl; n > 0; n -= vl, x += vl*inc_x2, y += vl*inc_y2) + { + vl = VSETVL(n); + + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); + + v0 = VFMULVF_FLOAT(vx0, alpha_r, vl); + v0 = VFNMSACVF_FLOAT(v0, alpha_i, vx1, vl); + v0 = VFMACCVF_FLOAT(v0, beta_r, vy0, vl); + v0 = VFNMSACVF_FLOAT(v0, beta_i, vy1, vl); + + v1 = VFMULVF_FLOAT(vx1, alpha_r, vl); + v1 = VFMACCVF_FLOAT(v1, alpha_i, vx0, vl); + v1 = VFMACCVF_FLOAT(v1, beta_r, vy1, vl); + v1 = VFMACCVF_FLOAT(v1, beta_i, vy0, vl); + + v_x2 = VSET_VX2(v_x2, 0, v0); + v_x2 = VSET_VX2(v_x2, 1, v1); + + VSSSEG_FLOAT(y, stride_y, v_x2, vl); + } + } + } + return(0); + +} diff --git a/kernel/riscv64/zaxpby_vector.c b/kernel/riscv64/zaxpby_vector.c index 5e6034ac5..bbf2bbe7d 100644 --- a/kernel/riscv64/zaxpby_vector.c +++ b/kernel/riscv64/zaxpby_vector.c @@ -28,25 +28,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) #define FLOAT_V_T vfloat32m4_t -#define VLSEV_FLOAT vlse32_v_f32m4 -#define VSSEV_FLOAT vsse32_v_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMULVF_FLOAT vfmul_vf_f32m4 -#define VFMSACVF_FLOAT vfmsac_vf_f32m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4) +#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f32m4) +#define VFMSACVF_FLOAT RISCV_RVV(vfmsac_vf_f32m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4) #else -#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) #define FLOAT_V_T vfloat64m4_t -#define VLSEV_FLOAT vlse64_v_f64m4 -#define VSSEV_FLOAT vsse64_v_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMULVF_FLOAT vfmul_vf_f64m4 -#define VFMSACVF_FLOAT vfmsac_vf_f64m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4) +#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f64m4) +#define VFMSACVF_FLOAT RISCV_RVV(vfmsac_vf_f64m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4) #endif int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FLOAT beta_r, FLOAT beta_i, FLOAT *y, BLASLONG inc_y) @@ -62,6 +62,82 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL stride_x = inc_x * 2 * sizeof(FLOAT); stride_y = inc_y * 2 * sizeof(FLOAT); + if (inc_x == 0 || 
inc_y == 0) { + + FLOAT temp; + BLASLONG inc_x2, inc_y2; + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + if ( beta_r == 0.0 && beta_i == 0.0) + { + if ( alpha_r == 0.0 && alpha_i == 0.0 ) + { + + while(i < n) + { + y[iy] = 0.0 ; + y[iy+1] = 0.0 ; + iy += inc_y2 ; + i++ ; + } + + } + else + { + + while(i < n) + { + y[iy] = ( alpha_r * x[ix] - alpha_i * x[ix+1] ) ; + y[iy+1] = ( alpha_r * x[ix+1] + alpha_i * x[ix] ) ; + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + } + + + } + + } + else + { + if ( alpha_r == 0.0 && alpha_i == 0.0 ) + { + + while(i < n) + { + temp = ( beta_r * y[iy] - beta_i * y[iy+1] ) ; + y[iy+1] = ( beta_r * y[iy+1] + beta_i * y[iy] ) ; + y[iy] = temp; + iy += inc_y2 ; + i++ ; + } + + } + else + { + + while(i < n) + { + temp = ( alpha_r * x[ix] - alpha_i * x[ix+1] ) + ( beta_r * y[iy] - beta_i * y[iy+1] ) ; + y[iy+1] = ( alpha_r * x[ix+1] + alpha_i * x[ix] ) + ( beta_r * y[iy+1] + beta_i * y[iy] ) ; + y[iy] = temp; + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + } + + + } + + + + } + return(0); + + } else { + if(beta_r == 0.0 && beta_i == 0.0){ if(alpha_r == 0.0 && alpha_i == 0.0){ if(inc_y == 1){ @@ -191,5 +267,6 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL } } return(0); + } } diff --git a/kernel/riscv64/zaxpy.c b/kernel/riscv64/zaxpy.c index 1dcaeac27..18b6315cb 100644 --- a/kernel/riscv64/zaxpy.c +++ b/kernel/riscv64/zaxpy.c @@ -44,7 +44,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, BLASLONG inc_x2; BLASLONG inc_y2; - if ( n < 0 ) return(0); + if ( n <= 0 ) return(0); if ( da_r == 0.0 && da_i == 0.0 ) return(0); ix = 0; diff --git a/kernel/riscv64/zaxpy_rvv.c b/kernel/riscv64/zaxpy_rvv.c new file mode 100644 index 000000000..0db32df10 --- /dev/null +++ b/kernel/riscv64/zaxpy_rvv.c @@ -0,0 +1,190 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VSET_VX2 __riscv_vset_v_f32m4_f32m4x2 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4x2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 +#else +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VSET_VX2 __riscv_vset_v_f64m4_f64m4x2 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4x2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + if(n < 0) return(0); + if(da_r == 0.0 && da_i == 0.0) return(0); + + FLOAT_V_T vx0, vx1, vy0, vy1; + FLOAT_VX2_T vxx2, vyx2; + + if(inc_x == 1 && inc_y == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*2) { + vl = VSETVL(n); + + vxx2 = VLSEG_FLOAT(x, vl); + vyx2 = VLSEG_FLOAT(y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); + + #if !defined(CONJ) + vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); + vy0 = VFNMSACVF_FLOAT(vy0, da_i, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_r, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); + #else + vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); + vy0 = VFMACCVF_FLOAT(vy0, da_i, vx1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); + #endif + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + VSSEG_FLOAT(y, vyx2, vl); + } + + } else if (inc_x == 1) { + + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) { + vl = VSETVL(n); + + vxx2 = VLSEG_FLOAT(x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); + + #if !defined(CONJ) + vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); + vy0 = VFNMSACVF_FLOAT(vy0, da_i, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_r, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); + #else + vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); + vy0 = VFMACCVF_FLOAT(vy0, da_i, vx1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); + #endif + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + VSSSEG_FLOAT(y, stride_y, vyx2, vl); + } + + } else if (inc_y == 1) { + + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) { + vl = VSETVL(n); + + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSEG_FLOAT(y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); + + #if !defined(CONJ) + vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); + vy0 = VFNMSACVF_FLOAT(vy0, da_i, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, 
da_r, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); + #else + vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); + vy0 = VFMACCVF_FLOAT(vy0, da_i, vx1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); + #endif + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + VSSEG_FLOAT(y, vyx2, vl); + } + + } else { + + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) { + vl = VSETVL(n); + + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); + + #if !defined(CONJ) + vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); + vy0 = VFNMSACVF_FLOAT(vy0, da_i, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_r, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); + #else + vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); + vy0 = VFMACCVF_FLOAT(vy0, da_i, vx1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); + #endif + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + VSSSEG_FLOAT(y, stride_y, vyx2, vl); + } + + } + + return(0); +} diff --git a/kernel/riscv64/zaxpy_vector.c b/kernel/riscv64/zaxpy_vector.c index 4ccfe4a81..1e766c5f4 100644 --- a/kernel/riscv64/zaxpy_vector.c +++ b/kernel/riscv64/zaxpy_vector.c @@ -28,27 +28,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) #define FLOAT_V_T vfloat32m4_t -#define VLSEV_FLOAT vlse32_v_f32m4 -#define VSSEV_FLOAT vsse32_v_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4) #else -#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) #define FLOAT_V_T vfloat64m4_t -#define VLSEV_FLOAT vlse64_v_f64m4 -#define VSSEV_FLOAT vsse64_v_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4) #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { - BLASLONG i = 0, j = 0; - BLASLONG ix = 0,iy = 0; - if(n < 0) return(0); - if(da_r == 0.0 && da_i == 0.0) return(0); + BLASLONG i = 0, j = 0; + BLASLONG ix = 0,iy = 0; + if(n <= 0) return(0); + if(da_r == 0.0 && da_i == 0.0) return(0); unsigned int gvl = 0; BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); diff --git a/kernel/riscv64/zcopy.c b/kernel/riscv64/zcopy.c index 07fe584c5..b0f19efd5 100644 --- a/kernel/riscv64/zcopy.c +++ b/kernel/riscv64/zcopy.c @@ -43,7 +43,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) BLASLONG inc_x2; BLASLONG inc_y2; - if ( n < 0 ) return(0); + if ( n <= 0 ) return(0); inc_x2 = 2 * inc_x; inc_y2 = 2 * inc_y; diff --git a/kernel/riscv64/zcopy_rvv.c b/kernel/riscv64/zcopy_rvv.c new file mode 100644 index 000000000..13879f03b --- /dev/null +++ 
b/kernel/riscv64/zcopy_rvv.c @@ -0,0 +1,105 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL_M8(n) __riscv_vsetvl_e32m8(n) +#define FLOAT_V_T_M8 vfloat32m8_t +#define VLEV_FLOAT_M8 __riscv_vle32_v_f32m8 +#define VSEV_FLOAT_M8 __riscv_vse32_v_f32m8 + +#define VSETVL_M4(n) __riscv_vsetvl_e32m4(n) +#define FLOAT_VX2_T_M4 vfloat32m4x2_t +#define VLSEG_FLOAT_M4 __riscv_vlseg2e32_v_f32m4x2 +#define VSSEG_FLOAT_M4 __riscv_vsseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT_M4 __riscv_vlsseg2e32_v_f32m4x2 +#define VSSSEG_FLOAT_M4 __riscv_vssseg2e32_v_f32m4x2 +#else +#define VSETVL_M8(n) __riscv_vsetvl_e64m8(n) +#define FLOAT_V_T_M8 vfloat64m8_t +#define VLEV_FLOAT_M8 __riscv_vle64_v_f64m8 +#define VSEV_FLOAT_M8 __riscv_vse64_v_f64m8 + +#define VSETVL_M4(n) __riscv_vsetvl_e64m4(n) +#define FLOAT_VX2_T_M4 vfloat64m4x2_t +#define VLSEG_FLOAT_M4 __riscv_vlseg2e64_v_f64m4x2 +#define VSSEG_FLOAT_M4 __riscv_vsseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT_M4 __riscv_vlsseg2e64_v_f64m4x2 +#define VSSSEG_FLOAT_M4 __riscv_vssseg2e64_v_f64m4x2 +#endif + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + if(n <= 0) return(0); + + if(inc_x == 1 && inc_y == 1) { + + FLOAT_V_T_M8 vx; + n *= 2; // convert to words + + for(size_t vl; n > 0; n -= vl, x += vl, y += vl) { + vl = VSETVL_M8(n); + vx = VLEV_FLOAT_M8(x, vl); + VSEV_FLOAT_M8(y, vx, vl); + } + + }else if (1 == inc_x) { + + FLOAT_VX2_T_M4 vx2; + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for(size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) { + vl = VSETVL_M4(n); + vx2 = VLSEG_FLOAT_M4(x, vl); + VSSSEG_FLOAT_M4(y, stride_y, vx2, vl); + } + } else if (1 == inc_y) { + + FLOAT_VX2_T_M4 vx2; + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + + for(size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) { + vl = VSETVL_M4(n); + vx2 = VLSSEG_FLOAT_M4(x, stride_x, vl); + VSSEG_FLOAT_M4(y, 
vx2, vl); + } + } else { + + FLOAT_VX2_T_M4 vx2; + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for(size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) { + vl = VSETVL_M4(n); + vx2 = VLSSEG_FLOAT_M4(x, stride_x, vl); + VSSSEG_FLOAT_M4(y, stride_y, vx2, vl); + } + } + + return(0); +} diff --git a/kernel/riscv64/zcopy_vector.c b/kernel/riscv64/zcopy_vector.c index 55a480a35..9e4a67b71 100644 --- a/kernel/riscv64/zcopy_vector.c +++ b/kernel/riscv64/zcopy_vector.c @@ -27,15 +27,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) #define FLOAT_V_T vfloat32m4_t -#define VLSEV_FLOAT vlse32_v_f32m4 -#define VSSEV_FLOAT vsse32_v_f32m4 +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) #else -#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) #define FLOAT_V_T vfloat64m4_t -#define VLSEV_FLOAT vlse64_v_f64m4 -#define VSSEV_FLOAT vsse64_v_f64m4 +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) #endif diff --git a/kernel/riscv64/zdot_rvv.c b/kernel/riscv64/zdot_rvv.c new file mode 100644 index 000000000..13bc2ee39 --- /dev/null +++ b/kernel/riscv64/zdot_rvv.c @@ -0,0 +1,194 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m4() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 +#define VFMSACVV_FLOAT __riscv_vfmsac_vv_f32m4 +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f32m4_tu +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m4() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 +#define VFMSACVV_FLOAT __riscv_vfmsac_vv_f64m4 +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f64m4_tu +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#endif + +OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + OPENBLAS_COMPLEX_FLOAT result; + CREAL(result) = 0.0; + CIMAG(result) = 0.0; + + if ( n <= 0 ) return(result); + + FLOAT_V_T vr0, vr1, vx0, vx1, vy0, vy1; + FLOAT_V_T_M1 v_res, v_z0; + FLOAT_VX2_T vxx2, vyx2; + size_t vlmax_m1 = VSETVL_MAX_M1; + v_z0 = VFMVVF_FLOAT_M1(0, vlmax_m1); + + size_t vlmax = VSETVL_MAX; + vr0 = VFMVVF_FLOAT(0, vlmax); + vr1 = VFMVVF_FLOAT(0, vlmax); + + if(inc_x == 1 && inc_y == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*2) { + vl = VSETVL(n); + + vxx2 = VLSEG_FLOAT(x, vl); + vyx2 = VLSEG_FLOAT(y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); + + vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, vy0, vl); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, vy1, vl); + #if !defined(CONJ) + vr0 = VFNMSACVV_FLOAT_TU(vr0, vx1, vy1, vl); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, vy0, vl); + #else + vr0 = VFMACCVV_FLOAT_TU(vr0, vx1, vy1, vl); + vr1 = VFNMSACVV_FLOAT_TU(vr1, vx1, vy0, vl); + #endif + } + + } else if (inc_x == 1){ + + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) { + vl = VSETVL(n); + + vxx2 = VLSEG_FLOAT(x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); + + vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, vy0, vl); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, vy1, vl); + #if !defined(CONJ) + vr0 = VFNMSACVV_FLOAT_TU(vr0, vx1, vy1, vl); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, vy0, vl); + #else + vr0 = VFMACCVV_FLOAT_TU(vr0, vx1, vy1, vl); + vr1 = VFNMSACVV_FLOAT_TU(vr1, vx1, vy0, vl); + #endif + } + } else if (inc_y == 1){ + + BLASLONG stride_x = inc_x * 2 * 
sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) { + vl = VSETVL(n); + + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSEG_FLOAT(y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); + + vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, vy0, vl); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, vy1, vl); + #if !defined(CONJ) + vr0 = VFNMSACVV_FLOAT_TU(vr0, vx1, vy1, vl); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, vy0, vl); + #else + vr0 = VFMACCVV_FLOAT_TU(vr0, vx1, vy1, vl); + vr1 = VFNMSACVV_FLOAT_TU(vr1, vx1, vy0, vl); + #endif + } + }else { + + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) { + vl = VSETVL(n); + + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); + + vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, vy0, vl); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, vy1, vl); + #if !defined(CONJ) + vr0 = VFNMSACVV_FLOAT_TU(vr0, vx1, vy1, vl); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, vy0, vl); + #else + vr0 = VFMACCVV_FLOAT_TU(vr0, vx1, vy1, vl); + vr1 = VFNMSACVV_FLOAT_TU(vr1, vx1, vy0, vl); + #endif + } + } + + v_res = VFREDSUM_FLOAT(vr0, v_z0, vlmax); + CREAL(result) = VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUM_FLOAT(vr1, v_z0, vlmax); + CIMAG(result) = VFMVFS_FLOAT_M1(v_res); + + return(result); +} diff --git a/kernel/riscv64/zdot_vector.c b/kernel/riscv64/zdot_vector.c index 0900206b3..13b8fe378 100644 --- a/kernel/riscv64/zdot_vector.c +++ b/kernel/riscv64/zdot_vector.c @@ -27,37 +27,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLEV_FLOAT vle32_v_f32m4 -#define VLSEV_FLOAT vlse32_v_f32m4 -#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 -#define VFMACCVV_FLOAT vfmacc_vv_f32m4 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFDOTVV_FLOAT vfdot_vv_f32m4 -#define VFMULVV_FLOAT vfmul_vv_f32m4 -#define VFMSACVV_FLOAT vfmsac_vv_f32m4 -#define VFNMSACVV_FLOAT vfnmsac_vv_f32m4 +#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f32m1_f32) +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) RISCV_RVV(vfredusum_vs_f32m4_f32m1)(v_res, va, vb, gvl) #else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m1() +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m4_f32m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFDOTVV_FLOAT RISCV_RVV(vfdot_vv_f32m4) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m4) +#define VFMSACVV_FLOAT RISCV_RVV(vfmsac_vv_f32m4) +#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f32m4) +#else +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLEV_FLOAT vle64_v_f64m4 -#define VLSEV_FLOAT vlse64_v_f64m4 -#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 -#define VFMACCVV_FLOAT vfmacc_vv_f64m4 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFDOTVV_FLOAT vfdot_vv_f64m4 -#define VFMULVV_FLOAT vfmul_vv_f64m4 -#define VFMSACVV_FLOAT vfmsac_vv_f64m4 -#define VFNMSACVV_FLOAT vfnmsac_vv_f64m4 +#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f64m1_f64) +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) RISCV_RVV(vfredusum_vs_f64m4_f64m1)(v_res, va, vb, gvl) +#else +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m4_f64m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFDOTVV_FLOAT RISCV_RVV(vfdot_vv_f64m4) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m4) +#define VFMSACVV_FLOAT RISCV_RVV(vfmsac_vv_f64m4) +#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f64m4) #endif OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) @@ -109,9 +117,9 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA ix += inc_xv; iy += inc_yv; } - v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl); + v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl); dot[0] += VFMVFS_FLOAT(v_res); - v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl); + v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl); dot[1] += VFMVFS_FLOAT(v_res); //tail if(j < n){ @@ -132,9 +140,9 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA vr1 = VFMULVV_FLOAT(vx1, vy0, gvl); vr1 = VFMSACVV_FLOAT(vr1, vx0, vy1, gvl); #endif - v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl); + v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl); dot[0] += 
VFMVFS_FLOAT(v_res); - v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl); + v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl); dot[1] += VFMVFS_FLOAT(v_res); } CREAL(result) = dot[0]; diff --git a/kernel/riscv64/zgemm_beta_rvv.c b/kernel/riscv64/zgemm_beta_rvv.c new file mode 100644 index 000000000..ee334801b --- /dev/null +++ b/kernel/riscv64/zgemm_beta_rvv.c @@ -0,0 +1,130 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VSET_VX2 __riscv_vset_v_f32m4_f32m4x2 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4x2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m4 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4 +#define VFSUBVV_FLOAT __riscv_vfsub_vv_f32m4 +#else +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VSET_VX2 __riscv_vset_v_f64m4_f64m4x2 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4x2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m4 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4 +#define VFSUBVV_FLOAT __riscv_vfsub_vv_f64m4 +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, + FLOAT beta_r, FLOAT beta_i, + FLOAT *dummy2, BLASLONG dummy3, + FLOAT *dummy4, BLASLONG dummy5, + FLOAT *c, BLASLONG ldc) +{ + BLASLONG chunk; + FLOAT *c_offset; + size_t vl; + FLOAT_V_T vr, vi, v1, v2, v3, v4; + FLOAT_VX2_T vx2; + + ldc *= 2; + c_offset = c; + + if (beta_r == 0.0 && beta_i == 0.0) { + + vl = VSETVL(m); + vr = VFMVVF_FLOAT(0.0, vl); + vi = VFMVVF_FLOAT(0.0, vl); + vx2 = VSET_VX2(vx2, 0, vr); + vx2 = VSET_VX2(vx2, 1, vi); + + for( ; n > 0; n--, c += ldc) { + c_offset = c; + + for(chunk=m; chunk > 0; chunk -= vl, c_offset += vl*2) { + vl = VSETVL(chunk); + + VSSEG_FLOAT(c_offset, vx2, vl); + } + } + + } else { + + for( ; n > 0; n--, c += ldc) { + c_offset = c; + + for(chunk=m; chunk > 0; chunk -= vl, c_offset += vl*2) { + vl = VSETVL(chunk); + + vx2 = VLSEG_FLOAT(c_offset, vl); + vr = VGET_VX2(vx2, 0); + vi = VGET_VX2(vx2, 1); + + v1 = VFMULVF_FLOAT(vr, beta_r, vl); + v2 = VFMULVF_FLOAT(vi, beta_i, vl); + + v3 = VFMULVF_FLOAT(vi, beta_r, vl); + v4 = VFMULVF_FLOAT(vr, beta_i, vl); + + vr = VFSUBVV_FLOAT(v1, v2, vl); + vi = VFADDVV_FLOAT(v3, v4, vl); + + vx2 = VSET_VX2(vx2, 0, vr); + vx2 = VSET_VX2(vx2, 1, vi); + VSSEG_FLOAT(c_offset, vx2, vl); + } + } + + } + + return 0; +} diff --git a/kernel/riscv64/zgemm_kernel_4x4_zvl128b.c b/kernel/riscv64/zgemm_kernel_4x4_zvl128b.c new file mode 100644 index 000000000..0776f03fd --- /dev/null +++ b/kernel/riscv64/zgemm_kernel_4x4_zvl128b.c @@ -0,0 +1,720 @@ +/* + +AUTOGENERATED KERNEL +Script: ./kernel/riscv64/generate_kernel.py +Settings: + LMUL=2 + M=4 + M_tail_scalar_from=2 + N=4 + __riscv_='__riscv_' + complex=True + conjugate=False + cpu='zvl128b' + force_acc_double=False + index_type='BLASLONG' + op='gemm' + param_precision='double' + reg_width_bits=128 + tail_policy='' + trace=False + +Derived: + ELEN_ACC=64 + ELEN_PARAM=64 + LMUL_ACC=2 + VFMACC='__riscv_vfmacc_vf_f64m2' + VFMUL='__riscv_vfmul_vf_f64m2' + VLEV='__riscv_vle64_v_f64m2' + VLSEV='__riscv_vlse64_v_f64m2' + VMACC_TO_ACC='__riscv_vfmacc_vf_f64m2' + VMUL_TO_ACC='__riscv_vfmul_vf_f64m2' + VSETVL='__riscv_vsetvl_e64m2' + VSEV='__riscv_vse64_v_f64m2' + VSSEV='__riscv_vsse64_v_f64m2' + acc_vector_t='vfloat64m2_t' + output='zgemm_kernel_4x4_zvl128b.c' + param_scalar_t='double' + param_vector_t='vfloat64m2_t' + +*/ + +#include "common.h" + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define S0 1 +#define S1 -1 
+#define S2 1 +#define S3 1 +#define VFMACC_RR __riscv_vfmsac +#define VFMACC_RI __riscv_vfmacc +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define S0 1 +#define S1 1 +#define S2 1 +#define S3 -1 +#define VFMACC_RR __riscv_vfmacc +#define VFMACC_RI __riscv_vfmsac +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define S0 1 +#define S1 1 +#define S2 -1 +#define S3 1 +#define VFMACC_RR __riscv_vfmacc +#define VFMACC_RI __riscv_vfnmsac +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define S0 1 +#define S1 -1 +#define S2 -1 +#define S3 -1 +#define VFMACC_RR __riscv_vfmsac +#define VFMACC_RI __riscv_vfnmacc +#endif + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc) + +{ + BLASLONG gvl = 0; + BLASLONG m_top = 0; + BLASLONG n_top = 0; + + // -- MAIN PASS + + for (BLASLONG j = 0; j < N / 4; j += 1) { + m_top = 0; + BLASLONG gvl = __riscv_vsetvl_e64m2(4); + + for (BLASLONG i = 0; i < M / 4; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + double B0r = B[bi + 0 * 2 + 0]; + double B0i = B[bi + 0 * 2 + 1]; + double B1r = B[bi + 1 * 2 + 0]; + double B1i = B[bi + 1 * 2 + 1]; + double B2r = B[bi + 2 * 2 + 0]; + double B2i = B[bi + 2 * 2 + 1]; + double B3r = B[bi + 3 * 2 + 0]; + double B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + vfloat64m2_t A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k + // leaving 6 vector registers for temporaries + // performing 2 operations between reuses of temporaries + vfloat64m2_t tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + vfloat64m2_t tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + vfloat64m2_t tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); + vfloat64m2_t tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat64m2_t ACC0r = tmp0r; + vfloat64m2_t ACC0i = tmp0i; + vfloat64m2_t ACC1r = tmp1r; + vfloat64m2_t ACC1i = tmp1i; + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f64m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f64m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + vfloat64m2_t ACC2r = tmp0r; + vfloat64m2_t ACC2i = tmp0i; + vfloat64m2_t ACC3r = tmp1r; + vfloat64m2_t ACC3i = tmp1i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + B2r = B[bi + 2 * 2 + 0]; + B2i = B[bi + 2 * 2 + 1]; + B3r = B[bi + 3 * 2 + 0]; + B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, 
gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f64m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f64m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl); + ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl); + ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl); + ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m2_t C0r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t C0i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat64m2_t C1r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t C1i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat64m2_t C2r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t C2i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat64m2_t C3r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t C3i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); + C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); + C2r = __riscv_vfmacc(C2r, alphar, ACC2r, gvl); + C2i = __riscv_vfmacc(C2i, alphar, ACC2i, gvl); + C3r = __riscv_vfmacc(C3r, alphar, ACC3r, gvl); + C3i = __riscv_vfmacc(C3i, alphar, ACC3i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl); + C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl); + C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl); + C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl); + + m_top += 4; + } + + // -- tails for main pass + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + double result4 = 0; + double result5 = 0; + double result6 = 0; + double result7 = 0; + double result8 = 0; + double result9 = 0; + double result10 = 0; + double result11 = 0; + double result12 = 0; + double result13 = 0; + double result14 = 0; + double result15 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + 
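+      // Scalar fallback for the 2-row tail: each even/odd result pair accumulates
+      // the real and imaginary parts of one C(m,n) update over k, with the S0..S3
+      // signs chosen above supplying the conjugation for the NN/NT/.../CC variant.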
+ for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; + result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; + result8 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; + result9 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; + result10 += S0 * A[ai + 2 + 0] * B[bi + 4 + 0] + S1 * A[ai + 2 + 1] * B[bi + 4 + 1]; + result11 += S2 * A[ai + 2 + 1] * B[bi + 4 + 0] + S3 * A[ai + 2 + 0] * B[bi + 4 + 1]; + result12 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; + result13 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; + result14 += S0 * A[ai + 2 + 0] * B[bi + 6 + 0] + S1 * A[ai + 2 + 1] * B[bi + 6 + 1]; + result15 += S2 * A[ai + 2 + 1] * B[bi + 6 + 0] + S3 * A[ai + 2 + 0] * B[bi + 6 + 1]; + ai += 2 * 2; + bi += 4 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; + Cr += result4 * alphar; + Ci += result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 1) * 2 + 1]; + Cr += result6 * alphar; + Ci += result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; + Cr = C[(ci + 2 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 2 * ldc + 0) * 2 + 1]; + Cr += result8 * alphar; + Ci += result9 * alphar; + Cr -= result9 * alphai; + Ci += result8 * alphai; + C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 2 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 2 * ldc + 1) * 2 + 1]; + Cr += result10 * alphar; + Ci += result11 * alphar; + Cr -= result11 * alphai; + Ci += result10 * alphai; + C[(ci + 2 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 1) * 2 + 1] = Ci; + Cr = C[(ci + 3 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 3 * ldc + 0) * 2 + 1]; + Cr += result12 * alphar; + Ci += result13 * alphar; + Cr -= result13 * alphai; + Ci += result12 * alphai; + C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 3 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 3 * ldc + 1) * 2 + 1]; + Cr += result14 * alphar; + Ci += result15 * alphar; + Cr -= result15 * alphai; + Ci += result14 * alphai; + C[(ci + 3 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 1) 
* 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + double result4 = 0; + double result5 = 0; + double result6 = 0; + double result7 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; + result6 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; + result7 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; + ai += 1 * 2; + bi += 4 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 2 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 2 * ldc + 0) * 2 + 1]; + Cr += result4 * alphar; + Ci += result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 3 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 3 * ldc + 0) * 2 + 1]; + Cr += result6 * alphar; + Ci += result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 4; + } + + // -- tails for N=2 + + if (N & 2) { + gvl = __riscv_vsetvl_e64m2(4); + m_top = 0; + + for (BLASLONG i = 0; i < M / 4; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + double B0r = B[bi + 0 * 2 + 0]; + double B0i = B[bi + 0 * 2 + 1]; + double B1r = B[bi + 1 * 2 + 0]; + double B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + vfloat64m2_t A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k + // leaving 10 vector registers for temporaries + vfloat64m2_t tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + vfloat64m2_t tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + vfloat64m2_t tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); + vfloat64m2_t tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat64m2_t ACC0r = tmp0r; + vfloat64m2_t ACC0i = tmp0i; + vfloat64m2_t ACC1r = tmp1r; + vfloat64m2_t ACC1i = tmp1i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; 
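+          // Note (descriptive comment, not in the original patch): the strided loads below
+          // split the packed A column into real parts (even doubles) and imaginary parts
+          // (odd doubles); the VFMACC_RR/VFMACC_RI macros supply the sign pattern for the
+          // selected conjugation variant, and the complex products with B0 and B1 are
+          // accumulated into ACC0/ACC1 across the k loop.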
+ + A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m2_t C0r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t C0i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat64m2_t C1r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t C1i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); + C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + + m_top += 4; + } + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + double result4 = 0; + double result5 = 0; + double result6 = 0; + double result7 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; + result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; + ai += 2 * 2; + bi += 2 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; + Cr += 
result4 * alphar; + Ci += result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 1) * 2 + 1]; + Cr += result6 * alphar; + Ci += result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + ai += 1 * 2; + bi += 2 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 2; + } + + // -- tails for N=1 + + if (N & 1) { + gvl = __riscv_vsetvl_e64m2(4); + m_top = 0; + + for (BLASLONG i = 0; i < M / 4; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + double B0r = B[bi + 0 * 2 + 0]; + double B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + vfloat64m2_t A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k + // leaving 12 vector registers for temporaries + vfloat64m2_t tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + vfloat64m2_t tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + vfloat64m2_t ACC0r = tmp0r; + vfloat64m2_t ACC0i = tmp0i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m2_t C0r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t C0i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + + ci = n_top * ldc + m_top; + + 
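+      // Note (descriptive comment, not in the original patch): the stride-2 stores below
+      // write the alpha-scaled accumulators back to C, keeping its interleaved
+      // real/imaginary element layout.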
__riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + + m_top += 4; + } + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + ai += 2 * 2; + bi += 1 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + ai += 1 * 2; + bi += 1 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 1; + } + + return 0; +} diff --git a/kernel/riscv64/zgemm_kernel_8x4_zvl256b.c b/kernel/riscv64/zgemm_kernel_8x4_zvl256b.c new file mode 100644 index 000000000..ca33368f0 --- /dev/null +++ b/kernel/riscv64/zgemm_kernel_8x4_zvl256b.c @@ -0,0 +1,1253 @@ +/* + +AUTOGENERATED KERNEL +Settings: + LMUL=1 + M=8 + M_tail_scalar_from=1 + N=4 + __riscv_='__riscv_' + complex=True + conjugate=False + cpu='zvl256b' + force_acc_double=False + index_type='BLASLONG' + op='gemm' + param_precision='double' + reg_width_bits=256 + tail_policy='' + trace=False + +Derived: + ELEN_ACC=64 + ELEN_PARAM=64 + LMUL_ACC=1 + VFMACC='__riscv_vfmacc_vf_f64m1' + VFMUL='__riscv_vfmul_vf_f64m1' + VLEV='__riscv_vle64_v_f64m1' + VLSEV='__riscv_vlse64_v_f64m1' + VMACC_TO_ACC='__riscv_vfmacc_vf_f64m1' + VMUL_TO_ACC='__riscv_vfmul_vf_f64m1' + VSETVL='__riscv_vsetvl_e64m1' + VSEV='__riscv_vse64_v_f64m1' + VSSEV='__riscv_vsse64_v_f64m1' + acc_vector_t='vfloat64m1_t' + output='zgemm_kernel_8x4_zvl256b.c' + param_scalar_t='double' + param_vector_t='vfloat64m1_t' + +*/ + +#include "common.h" + + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + #define S0 1 + #define S1 -1 + #define S2 1 + #define S3 1 + #define VFMACC_RR __riscv_vfmsac + #define VFMACC_RI __riscv_vfmacc +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + #define S0 1 + #define S1 1 + #define S2 1 + #define S3 -1 + #define VFMACC_RR __riscv_vfmacc + #define VFMACC_RI __riscv_vfmsac +#endif +#if defined(RN) || defined(RT) || 
defined(CN) || defined(CT) + #define S0 1 + #define S1 1 + #define S2 -1 + #define S3 1 + #define VFMACC_RR __riscv_vfmacc + #define VFMACC_RI __riscv_vfnmsac +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + #define S0 1 + #define S1 -1 + #define S2 -1 + #define S3 -1 + #define VFMACC_RR __riscv_vfmsac + #define VFMACC_RI __riscv_vfnmacc +#endif + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc) + +{ + BLASLONG gvl = 0; + BLASLONG m_top = 0; + BLASLONG n_top = 0; + + + // -- MAIN PASS + + for (BLASLONG j=0; j N ) + n_packing >>= 1; + + BLASLONG m_packing = UNROLL_M; + BLASLONG m_top = 0; + while (m_top < M) + { + while( m_top+m_packing > M ) + m_packing >>= 1; + + BLASLONG ai = K*m_top*2; + BLASLONG bi = K*n_top*2; + + BLASLONG pass_K = K; + + + #ifdef TRMMKERNEL + #ifdef LEFT + BLASLONG off = offset + m_top; + #else + BLASLONG off = -offset + n_top; + #endif + #ifdef BACKWARDS + ai += off * m_packing*2; + bi += off * n_packing*2; + pass_K -= off; + #else + #ifdef LEFT + pass_K = off + m_packing; + #else + pass_K = off + n_packing; + #endif + #endif + #endif + + memset( res, 0, UNROLL_M*UNROLL_N*2*sizeof(FLOAT) ); + + for (BLASLONG k=0; k> 2); j > 0; j--) { + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset += 4 * lda; + + for (i = m; i > 0; i -= vl) { + vl = VSETVL(i); + v1x2 = VLSEG2_FLOAT(aoffset1, vl); + v2x2 = VLSEG2_FLOAT(aoffset2, vl); + v3x2 = VLSEG2_FLOAT(aoffset3, vl); + v4x2 = VLSEG2_FLOAT(aoffset4, vl); + + vxx8 = VSET_VX8(vxx8, 0, VGET_VX2(v1x2, 0)); + vxx8 = VSET_VX8(vxx8, 1, VGET_VX2(v1x2, 1)); + vxx8 = VSET_VX8(vxx8, 2, VGET_VX2(v2x2, 0)); + vxx8 = VSET_VX8(vxx8, 3, VGET_VX2(v2x2, 1)); + vxx8 = VSET_VX8(vxx8, 4, VGET_VX2(v3x2, 0)); + vxx8 = VSET_VX8(vxx8, 5, VGET_VX2(v3x2, 1)); + vxx8 = VSET_VX8(vxx8, 6, VGET_VX2(v4x2, 0)); + vxx8 = VSET_VX8(vxx8, 7, VGET_VX2(v4x2, 1)); + + VSSEG8_FLOAT(boffset, vxx8, vl); + + aoffset1 += vl * 2; + aoffset2 += vl * 2; + aoffset3 += vl * 2; + aoffset4 += vl * 2; + boffset += vl * 8; + } + } + + if (n & 2) { + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset += 2 * lda; + + for (i = m; i > 0; i -= vl) { + vl = VSETVL(i); + v1x2 = VLSEG2_FLOAT(aoffset1, vl); + v2x2 = VLSEG2_FLOAT(aoffset2, vl); + + vxx4 = VSET_VX4(vxx4, 0, VGET_VX2(v1x2, 0)); + vxx4 = VSET_VX4(vxx4, 1, VGET_VX2(v1x2, 1)); + vxx4 = VSET_VX4(vxx4, 2, VGET_VX2(v2x2, 0)); + vxx4 = VSET_VX4(vxx4, 3, VGET_VX2(v2x2, 1)); + + VSSEG4_FLOAT(boffset, vxx4, vl); + + aoffset1 += vl * 2; + aoffset2 += vl * 2; + boffset += vl * 4; + } + } + + if (n & 1) { + aoffset1 = aoffset; + aoffset += lda; + + for (i = m; i > 0; i -= vl) { + vl = VSETVL(i); + v1x2 = VLSEG2_FLOAT(aoffset1, vl); + + VSSEG2_FLOAT(boffset, v1x2, vl); + + aoffset1 += vl * 2; + boffset += vl * 2; + } + } + + return 0; +} diff --git a/kernel/riscv64/zgemm_ncopy_rvv_v1.c b/kernel/riscv64/zgemm_ncopy_rvv_v1.c new file mode 100644 index 000000000..275daa5f2 --- /dev/null +++ b/kernel/riscv64/zgemm_ncopy_rvv_v1.c @@ -0,0 +1,74 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_VX2_T vfloat32m2x2_t +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_VX2_T vfloat64m2x2_t +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#endif + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + + BLASLONG i, j; + + FLOAT *a_offset; + FLOAT *a_offset1; + FLOAT *b_offset; + + FLOAT_VX2_T vx2; + size_t vl; + + //fprintf(stderr, "%s, m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda); + a_offset = a; + b_offset = b; + + for(j = n; j > 0; j -= vl) { + vl = VSETVL(j); + + a_offset1 = a_offset; + a_offset += vl * lda * 2; + + for(i = m; i > 0; i--) { + vx2 = VLSSEG2_FLOAT(a_offset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG2_FLOAT(b_offset, vx2, vl); + + a_offset1 += 2; + b_offset += vl * 2; + } + } + return 0; +} + diff --git a/kernel/riscv64/zgemm_tcopy_4_rvv.c b/kernel/riscv64/zgemm_tcopy_4_rvv.c new file mode 100644 index 000000000..cfafbf0dc --- /dev/null +++ b/kernel/riscv64/zgemm_tcopy_4_rvv.c @@ -0,0 +1,191 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m1(n) +#define FLOAT_V_T vfloat32m1_t +#define FLOAT_VX2_T vfloat32m1x2_t +#define FLOAT_VX4_T vfloat32m1x4_t +#define FLOAT_VX8_T vfloat32m1x8_t +#define VLEV_FLOAT __riscv_vle32_v_f32m1 +#define VSEV_FLOAT __riscv_vse32_v_f32m1 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m1x2 +#define VLSSEG4_FLOAT __riscv_vlsseg4e32_v_f32m1x4 +#define VLSSEG8_FLOAT __riscv_vlsseg8e32_v_f32m1x8 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1x2 +#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1x4 +#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1x8 +#else +#define VSETVL(n) __riscv_vsetvl_e64m1(n) +#define FLOAT_V_T vfloat64m1_t +#define FLOAT_VX2_T vfloat64m1x2_t +#define FLOAT_VX4_T vfloat64m1x4_t +#define FLOAT_VX8_T vfloat64m1x8_t +#define VLEV_FLOAT __riscv_vle64_v_f64m1 +#define VSEV_FLOAT __riscv_vse64_v_f64m1 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m1x2 +#define VLSSEG4_FLOAT __riscv_vlsseg4e64_v_f64m1x4 +#define VLSSEG8_FLOAT __riscv_vlsseg8e64_v_f64m1x8 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1x2 +#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1x4 +#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1x8 +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + + BLASLONG i, j; + + IFLOAT *aoffset; + IFLOAT *aoffset1; + + IFLOAT *boffset, *boffset1, *boffset2, *boffset3; + + FLOAT_V_T v0; + FLOAT_VX2_T vx2; + FLOAT_VX4_T vx4; + FLOAT_VX8_T vx8; + + size_t vl; + + //fprintf(stderr, "%s m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda); + + aoffset = a; + boffset = b; + boffset2 = b + 2 * m * (n & ~3); + boffset3 = b + 2 * m * (n & ~1); + + for(j = (m >> 2); j > 0; j--) { + + aoffset1 = aoffset; + aoffset += 8 * lda; + + boffset1 = boffset; + boffset += 32; + + for(i = (n >> 2); i > 0; i--) { + vl = 4; + + vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG8_FLOAT(boffset1, vx8, vl); + + aoffset1 += 8; + boffset1 += m * 8; + } + + if (n & 2) { + vl = 4; + + vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG4_FLOAT(boffset2, vx4, vl); + + aoffset1 += 4; + boffset2 += 16; + } + + if (n & 1) { + vl = 4; + + vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG2_FLOAT(boffset3, vx2, vl); + + aoffset1 += 2; + boffset3 += 8; + } + } + + if (m & 2) { + aoffset1 = aoffset; + aoffset += 4 * lda; + + boffset1 = boffset; + boffset += 16; + + for(i = (n >> 2); i > 0; i--) { + vl = 2; + + vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG8_FLOAT(boffset1, vx8, vl); + + aoffset1 += 8; + boffset1 += m * 8; + } + + if (n & 2) { + vl = 2; + + vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG4_FLOAT(boffset2, vx4, vl); + + aoffset1 += 4; + boffset2 += 8; + } + + if (n & 1) { + vl = 2; + + vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG2_FLOAT(boffset3, vx2, vl); + + //aoffset1 += 2; + boffset3 += 4; + } + } + + if (m & 
1) { + aoffset1 = aoffset; + boffset1 = boffset; + + for(i = (n >> 2); i > 0; i--) { + vl = 8; + + v0 = VLEV_FLOAT(aoffset1, vl); + VSEV_FLOAT(boffset1, v0, vl); + + aoffset1 += 8; + boffset1 += 8 * m; + } + + if (n & 2) { + vl = 4; + + v0 = VLEV_FLOAT(aoffset1, vl); + VSEV_FLOAT(boffset2, v0, vl); + + aoffset1 += 4; + //boffset2 += 4; + } + + if (n & 1) { + *(boffset3) = *(aoffset1); + *(boffset3 + 1) = *(aoffset1 + 1); + } + } + + return 0; +} diff --git a/kernel/riscv64/zgemm_tcopy_rvv_v1.c b/kernel/riscv64/zgemm_tcopy_rvv_v1.c new file mode 100644 index 000000000..96e986502 --- /dev/null +++ b/kernel/riscv64/zgemm_tcopy_rvv_v1.c @@ -0,0 +1,74 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_VX2_T vfloat32m2x2_t +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_VX2_T vfloat64m2x2_t +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#endif + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) +{ + BLASLONG i, j; + + IFLOAT *aoffset; + IFLOAT *aoffset1; + IFLOAT *boffset; + + FLOAT_VX2_T vx2; + size_t vl; + + //fprintf(stderr, "%s, m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda); + + aoffset = a; + boffset = b; + + for(j = n; j > 0; j -= vl) { + vl = VSETVL(j); + + aoffset1 = aoffset; + aoffset += vl * 2; + + for(i = m; i > 0; i--) { + vx2 = VLSEG2_FLOAT(aoffset1, vl); + VSSEG2_FLOAT(boffset, vx2, vl); + + aoffset1 += lda * 2; + boffset += vl * 2; + } + } + + return 0; +} diff --git a/kernel/riscv64/zgemmkernel_rvv_v1x4.c b/kernel/riscv64/zgemmkernel_rvv_v1x4.c new file mode 100644 index 000000000..77e012ff5 --- /dev/null +++ b/kernel/riscv64/zgemmkernel_rvv_v1x4.c @@ -0,0 +1,553 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define OP_rr VFMACCVF_FLOAT +#define OP_ir VFMACCVF_FLOAT +#define OP_ii VFNMSACVF_FLOAT +#define OP_ri VFMACCVF_FLOAT +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define OP_rr VFMACCVF_FLOAT +#define OP_ir VFMACCVF_FLOAT +#define OP_ii VFMACCVF_FLOAT +#define OP_ri VFNMSACVF_FLOAT +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define OP_rr VFMACCVF_FLOAT +#define OP_ir VFNMSACVF_FLOAT +#define OP_ii VFMACCVF_FLOAT +#define OP_ri VFMACCVF_FLOAT +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define OP_rr VFMACCVF_FLOAT +#define OP_ir VFNMSACVF_FLOAT +#define OP_ii VFNMSACVF_FLOAT +#define OP_ri VFNMSACVF_FLOAT +#endif + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc +#ifdef TRMMKERNEL + , BLASLONG offset +#endif + ) +{ + BLASLONG i,j,k; + FLOAT *C0, *C1, *C2, *C3, *ptrba,*ptrbb; + + FLOAT_VX2_T vax2; + FLOAT_V_T va0, va1, va2, va3, va4, va5, va6, va7; + FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7; + + //fprintf(stderr, "%s, bn=%ld bm=%ld bk=%ld alphar=%f alphai=%f ldc=%ld\n", __FUNCTION__, bn, bm, bk, alphar, alphai, ldc); // Debug + + size_t vl; + for (j = bn/4; j > 0; j--) + { + C0 = C; + C1 = C0 + 2 * ldc; + C2 = C1 + 2 * ldc; + C3 = C2 + 2 * ldc; + ptrba = ba; + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + vres2 = VFMVVF_FLOAT(0.0, vl); + vres3 = VFMVVF_FLOAT(0.0, vl); + vres4 = VFMVVF_FLOAT(0.0, vl); + vres5 = VFMVVF_FLOAT(0.0, vl); + vres6 = VFMVVF_FLOAT(0.0, vl); + vres7 = VFMVVF_FLOAT(0.0, vl); + + for (k = bk/4; k > 0; k--) + { + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vax2 = VLSEG2_FLOAT(ptrba, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va0, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va1, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va1, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), 
va0, vl); + + vres4 = OP_rr(vres4, *(ptrbb + 4), va0, vl); + vres5 = OP_ir(vres5, *(ptrbb + 4), va1, vl); + vres4 = OP_ii(vres4, *(ptrbb + 5), va1, vl); + vres5 = OP_ri(vres5, *(ptrbb + 5), va0, vl); + + vres6 = OP_rr(vres6, *(ptrbb + 6), va0, vl); + vres7 = OP_ir(vres7, *(ptrbb + 6), va1, vl); + vres6 = OP_ii(vres6, *(ptrbb + 7), va1, vl); + vres7 = OP_ri(vres7, *(ptrbb + 7), va0, vl); + + ptrbb += 8; + + vax2 = VLSEG2_FLOAT(ptrba, vl); + va4 = VGET_VX2(vax2, 0); + va5 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va3, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va3, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va2, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va2, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va3, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va3, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va2, vl); + + vres4 = OP_rr(vres4, *(ptrbb + 4), va2, vl); + vres5 = OP_ir(vres5, *(ptrbb + 4), va3, vl); + vres4 = OP_ii(vres4, *(ptrbb + 5), va3, vl); + vres5 = OP_ri(vres5, *(ptrbb + 5), va2, vl); + + vres6 = OP_rr(vres6, *(ptrbb + 6), va2, vl); + vres7 = OP_ir(vres7, *(ptrbb + 6), va3, vl); + vres6 = OP_ii(vres6, *(ptrbb + 7), va3, vl); + vres7 = OP_ri(vres7, *(ptrbb + 7), va2, vl); + + ptrbb += 8; + + vax2 = VLSEG2_FLOAT(ptrba, vl); + va6 = VGET_VX2(vax2, 0); + va7 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va5, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va5, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va4, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va4, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va5, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va5, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va4, vl); + + vres4 = OP_rr(vres4, *(ptrbb + 4), va4, vl); + vres5 = OP_ir(vres5, *(ptrbb + 4), va5, vl); + vres4 = OP_ii(vres4, *(ptrbb + 5), va5, vl); + vres5 = OP_ri(vres5, *(ptrbb + 5), va4, vl); + + vres6 = OP_rr(vres6, *(ptrbb + 6), va4, vl); + vres7 = OP_ir(vres7, *(ptrbb + 6), va5, vl); + vres6 = OP_ii(vres6, *(ptrbb + 7), va5, vl); + vres7 = OP_ri(vres7, *(ptrbb + 7), va4, vl); + ptrbb += 8; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va6, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va7, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va7, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va6, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va6, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va7, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va7, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va6, vl); + + vres4 = OP_rr(vres4, *(ptrbb + 4), va6, vl); + vres5 = OP_ir(vres5, *(ptrbb + 4), va7, vl); + vres4 = OP_ii(vres4, *(ptrbb + 5), va7, vl); + vres5 = OP_ri(vres5, *(ptrbb + 5), va6, vl); + + vres6 = OP_rr(vres6, *(ptrbb + 6), va6, vl); + vres7 = OP_ir(vres7, *(ptrbb + 6), va7, vl); + vres6 = OP_ii(vres6, *(ptrbb + 7), va7, vl); + vres7 = OP_ri(vres7, *(ptrbb + 7), va6, vl); + + ptrbb += 8; + } + + for (k = (bk & 3); k > 0; k--) + { + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va0, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va1, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va1, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va0, vl); + + vres4 = OP_rr(vres4, *(ptrbb + 4), va0, vl); + vres5 = OP_ir(vres5, *(ptrbb 
+ 4), va1, vl); + vres4 = OP_ii(vres4, *(ptrbb + 5), va1, vl); + vres5 = OP_ri(vres5, *(ptrbb + 5), va0, vl); + + vres6 = OP_rr(vres6, *(ptrbb + 6), va0, vl); + vres7 = OP_ir(vres7, *(ptrbb + 6), va1, vl); + vres6 = OP_ii(vres6, *(ptrbb + 7), va1, vl); + vres7 = OP_ri(vres7, *(ptrbb + 7), va0, vl); + + ptrbb += 8; + } + + vax2 = VLSEG2_FLOAT(C0, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + + vax2 = VLSEG2_FLOAT(C1, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); + + va0 = VFMACCVF_FLOAT(va0, alphar, vres0, vl); + va1 = VFMACCVF_FLOAT(va1, alphar, vres1, vl); + va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl); + va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl); + + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(C0, vax2, vl); + + va2 = VFMACCVF_FLOAT(va2, alphar, vres2, vl); + va3 = VFMACCVF_FLOAT(va3, alphar, vres3, vl); + va2 = VFNMSACVF_FLOAT(va2, alphai, vres3, vl); + va3 = VFMACCVF_FLOAT(va3, alphai, vres2, vl); + + vax2 = VSET_VX2(vax2, 0, va2); + vax2 = VSET_VX2(vax2, 1, va3); + VSSEG2_FLOAT(C1, vax2, vl); + + vax2 = VLSEG2_FLOAT(C2, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + + vax2 = VLSEG2_FLOAT(C3, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); + + va0 = VFMACCVF_FLOAT(va0, alphar, vres4, vl); + va1 = VFMACCVF_FLOAT(va1, alphar, vres5, vl); + va0 = VFNMSACVF_FLOAT(va0, alphai, vres5, vl); + va1 = VFMACCVF_FLOAT(va1, alphai, vres4, vl); + + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(C2, vax2, vl); + + va2 = VFMACCVF_FLOAT(va2, alphar, vres6, vl); + va3 = VFMACCVF_FLOAT(va3, alphar, vres7, vl); + va2 = VFNMSACVF_FLOAT(va2, alphai, vres7, vl); + va3 = VFMACCVF_FLOAT(va3, alphai, vres6, vl); + + vax2 = VSET_VX2(vax2, 0, va2); + vax2 = VSET_VX2(vax2, 1, va3); + VSSEG2_FLOAT(C3, vax2, vl); + + C0 += vl * 2; + C1 += vl * 2; + C2 += vl * 2; + C3 += vl * 2; + } + + bb += (bk << 3); + C += (ldc << 3); + } + + if (bn & 2) + { + C0 = C; + C1 = C0 + 2 * ldc; + ptrba = ba; + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + vres2 = VFMVVF_FLOAT(0.0, vl); + vres3 = VFMVVF_FLOAT(0.0, vl); + + for (k = bk/4; k > 0; k--) + { + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vax2 = VLSEG2_FLOAT(ptrba, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va0, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va1, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va1, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va0, vl); + + ptrbb += 4; + + vax2 = VLSEG2_FLOAT(ptrba, vl); + va4 = VGET_VX2(vax2, 0); + va5 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va3, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va3, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va2, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va2, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va3, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va3, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va2, vl); + + ptrbb += 4; + + vax2 = VLSEG2_FLOAT(ptrba, vl); + va6 = VGET_VX2(vax2, 0); + va7 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl); + vres1 = OP_ir(vres1, *(ptrbb + 
0), va5, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va5, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va4, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va4, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va5, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va5, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va4, vl); + + ptrbb += 4; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va6, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va7, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va7, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va6, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va6, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va7, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va7, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va6, vl); + + ptrbb += 4; + } + + for (k = (bk & 3); k > 0; k--) + { + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va0, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va1, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va1, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va0, vl); + + ptrbb += 4; + } + + vax2 = VLSEG2_FLOAT(C0, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + + vax2 = VLSEG2_FLOAT(C1, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); + + va0 = VFMACCVF_FLOAT(va0, alphar, vres0, vl); + va1 = VFMACCVF_FLOAT(va1, alphar, vres1, vl); + va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl); + va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl); + + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(C0, vax2, vl); + + va2 = VFMACCVF_FLOAT(va2, alphar, vres2, vl); + va3 = VFMACCVF_FLOAT(va3, alphar, vres3, vl); + va2 = VFNMSACVF_FLOAT(va2, alphai, vres3, vl); + va3 = VFMACCVF_FLOAT(va3, alphai, vres2, vl); + + vax2 = VSET_VX2(vax2, 0, va2); + vax2 = VSET_VX2(vax2, 1, va3); + VSSEG2_FLOAT(C1, vax2, vl); + + C0 += vl * 2; + C1 += vl * 2; + } + + bb += (bk << 2); + C += (ldc << 2); + } + + if (bn & 1) + { + C0 = C; + ptrba = ba; + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + + for (k = bk/4; k > 0; k--) + { + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vax2 = VLSEG2_FLOAT(ptrba, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + ptrbb += 2; + + vax2 = VLSEG2_FLOAT(ptrba, vl); + va4 = VGET_VX2(vax2, 0); + va5 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va3, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va3, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va2, vl); + + ptrbb += 2; + + vax2 = VLSEG2_FLOAT(ptrba, vl); + va6 = VGET_VX2(vax2, 0); + va7 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va5, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va5, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va4, vl); + ptrbb += 2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va6, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va7, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va7, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va6, vl); + ptrbb += 
2; + } + + for (k = (bk & 3); k > 0; k--) + { + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + ptrbb += 2; + } + + vax2 = VLSEG2_FLOAT(C0, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + + va0 = VFMACCVF_FLOAT(va0, alphar, vres0, vl); + va1 = VFMACCVF_FLOAT(va1, alphar, vres1, vl); + va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl); + va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl); + + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(C0, vax2, vl); + C0 += vl * 2; + } + + bb += bk << 1; + C += ldc << 1; + } + return 0; +} + diff --git a/kernel/riscv64/zgemv_n_rvv.c b/kernel/riscv64/zgemv_n_rvv.c new file mode 100644 index 000000000..f14ef5ba8 --- /dev/null +++ b/kernel/riscv64/zgemv_n_rvv.c @@ -0,0 +1,192 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VSET_VX2 __riscv_vset_v_f32m4_f32m4x2 +#define VLEV_FLOAT __riscv_vle32_v_f32m4 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 +#define VSEV_FLOAT __riscv_vse32_v_f32m4 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4x2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 +#else +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VSET_VX2 __riscv_vset_v_f64m4_f64m4x2 +#define VLEV_FLOAT __riscv_vle64_v_f64m4 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 +#define VSEV_FLOAT __riscv_vse64_v_f64m4 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4x2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i; + BLASLONG ix; + FLOAT *a_ptr; + FLOAT temp_r, temp_i; + FLOAT_V_T va0, va1, vy0, vy1; + FLOAT_VX2_T vax2, vyx2; + + BLASLONG stride_y = inc_y * sizeof(FLOAT) * 2; + + BLASLONG inc_x2 = inc_x * 2; + BLASLONG lda2 = lda * 2; + if (inc_y == 1) + { + for (size_t vl; m > 0; m -= vl, a += vl*2, y += vl*2) { + vl = VSETVL(m); + a_ptr = a; + ix = 0; + vyx2 = VLSEG_FLOAT(y, vl); + + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); + + for(i = 0; i < n; i++){ +#if !defined(XCONJ) + temp_r = alpha_r * x[ix] - alpha_i * x[ix+1]; + temp_i = alpha_r * x[ix+1] + alpha_i * x[ix]; +#else + temp_r = alpha_r * x[ix] + alpha_i * x[ix+1]; + temp_i = alpha_r * x[ix+1] - alpha_i * x[ix]; +#endif + + vax2 = VLSEG_FLOAT(a_ptr, vl); + + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); +#if !defined(CONJ) +#if !defined(XCONJ) + vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, vl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, vl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, vl); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, vl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, vl); +#endif +#else +#if !defined(XCONJ) + vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, vl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, vl); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, vl); +#endif +#endif + a_ptr += lda2; + ix += inc_x2; + } + + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + VSSEG_FLOAT(y, vyx2, vl); + } + + } + else + { + for (size_t vl; m > 0; m -= vl, a += vl*2, y += vl*inc_y*2) { + vl = VSETVL(m); + a_ptr 
= a; + ix = 0; + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); + + for(i = 0; i < n; i++){ +#if !defined(XCONJ) + temp_r = alpha_r * x[ix] - alpha_i * x[ix+1]; + temp_i = alpha_r * x[ix+1] + alpha_i * x[ix]; +#else + temp_r = alpha_r * x[ix] + alpha_i * x[ix+1]; + temp_i = alpha_r * x[ix+1] - alpha_i * x[ix]; +#endif + + vax2 = VLSEG_FLOAT(a_ptr, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); +#if !defined(CONJ) +#if !defined(XCONJ) + vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, vl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, vl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, vl); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, vl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, vl); +#endif +#else +#if !defined(XCONJ) + vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, vl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, vl); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, vl); +#endif +#endif + a_ptr += lda2; + ix += inc_x2; + } + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + VSSSEG_FLOAT(y, stride_y, vyx2, vl); + } + } + return(0); +} diff --git a/kernel/riscv64/zgemv_n_vector.c b/kernel/riscv64/zgemv_n_vector.c index 3095c28f9..104d3865d 100644 --- a/kernel/riscv64/zgemv_n_vector.c +++ b/kernel/riscv64/zgemv_n_vector.c @@ -27,23 +27,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) #define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT vle32_v_f32m4 -#define VLSEV_FLOAT vlse32_v_f32m4 -#define VSEV_FLOAT vse32_v_f32m4 -#define VSSEV_FLOAT vsse32_v_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4) #else -#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) #define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT vle64_v_f64m4 -#define VLSEV_FLOAT vlse64_v_f64m4 -#define VSEV_FLOAT vse64_v_f64m4 -#define VSSEV_FLOAT vsse64_v_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4) #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) diff --git a/kernel/riscv64/zgemv_t_rvv.c b/kernel/riscv64/zgemv_t_rvv.c new file mode 100644 index 000000000..1c89a9f72 --- /dev/null +++ b/kernel/riscv64/zgemv_t_rvv.c @@ -0,0 +1,187 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 +#define VFREDSUM_FLOAT_TU __riscv_vfredusum_vs_f32m4_f32m1_tu +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f32m4_tu +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 +#define VFREDSUM_FLOAT_TU __riscv_vfredusum_vs_f64m4_f64m1_tu +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f64m4_tu +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i = 0, j = 0; + BLASLONG ix = 0, iy = 0; + FLOAT *a_ptr = a; + FLOAT temp_r, temp_i; + + FLOAT_V_T va0, va1, vx0, vx1, vr, vi; + FLOAT_V_T_M1 v_res, v_z0; + FLOAT_VX2_T vxx2, vax2; + + BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; + //BLASLONG stride_a = sizeof(FLOAT) * 2; + BLASLONG inc_y2 = inc_y * 2; + BLASLONG lda2 = lda * 2; + + size_t vlmax = 
VSETVL_MAX_M1; + v_res = VFMVVF_FLOAT_M1(0, vlmax); + v_z0 = VFMVVF_FLOAT_M1(0, vlmax); + vlmax = VSETVL(m); + + if (inc_x == 1) + { + for(i = 0; i < n; i++) { + j = 0; + ix = 0; + vr = VFMVVF_FLOAT(0, vlmax); + vi = VFMVVF_FLOAT(0, vlmax); + for(size_t vl, k = m; k > 0; k -= vl) { + vl = VSETVL(k); + + vax2 = VLSEG_FLOAT(&a_ptr[j], vl); + vxx2 = VLSEG_FLOAT(&x[ix], vl); + + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + vr = VFMACCVV_FLOAT_TU(vr, va0, vx0, vl); + vr = VFNMSACVV_FLOAT_TU(vr, va1, vx1, vl); + vi = VFMACCVV_FLOAT_TU(vi, va0, vx1, vl); + vi = VFMACCVV_FLOAT_TU(vi, va1, vx0, vl); +#else + vr = VFMACCVV_FLOAT_TU(vr, va0, vx0, vl); + vr = VFMACCVV_FLOAT_TU(vr, va1, vx1, vl); + vi = VFMACCVV_FLOAT_TU(vi, va0, vx1, vl); + vi = VFNMSACVV_FLOAT_TU(vi, va1, vx0, vl); +#endif + j += vl * 2; + ix += vl * inc_x * 2; + } + + v_res = VFREDSUM_FLOAT_TU(v_res, vr, v_z0, vlmax); + temp_r = VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUM_FLOAT_TU(v_res, vi, v_z0, vlmax); + temp_i = VFMVFS_FLOAT_M1(v_res); + +#if !defined(XCONJ) + y[iy] += alpha_r * temp_r - alpha_i * temp_i; + y[iy+1] += alpha_r * temp_i + alpha_i * temp_r; +#else + y[iy] += alpha_r * temp_r + alpha_i * temp_i; + y[iy+1] -= alpha_r * temp_i - alpha_i * temp_r; +#endif + iy += inc_y2; + a_ptr += lda2; + } + } + else + { + for(i = 0; i < n; i++) { + j = 0; + ix = 0; + vr = VFMVVF_FLOAT(0, vlmax); + vi = VFMVVF_FLOAT(0, vlmax); + for(size_t vl, k = m; k > 0; k -= vl) { + vl = VSETVL(k); + + vax2 = VLSEG_FLOAT(&a_ptr[j], vl); + vxx2 = VLSSEG_FLOAT(&x[ix], stride_x, vl); + + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + vr = VFMACCVV_FLOAT_TU(vr, va0, vx0, vl); + vr = VFNMSACVV_FLOAT_TU(vr, va1, vx1, vl); + vi = VFMACCVV_FLOAT_TU(vi, va0, vx1, vl); + vi = VFMACCVV_FLOAT_TU(vi, va1, vx0, vl); +#else + vr = VFMACCVV_FLOAT_TU(vr, va0, vx0, vl); + vr = VFMACCVV_FLOAT_TU(vr, va1, vx1, vl); + vi = VFMACCVV_FLOAT_TU(vi, va0, vx1, vl); + vi = VFNMSACVV_FLOAT_TU(vi, va1, vx0, vl); +#endif + j += vl * 2; + ix += vl * inc_x * 2; + } + + v_res = VFREDSUM_FLOAT_TU(v_res, vr, v_z0, vlmax); + temp_r = VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUM_FLOAT_TU(v_res, vi, v_z0, vlmax); + temp_i = VFMVFS_FLOAT_M1(v_res); + +#if !defined(XCONJ) + y[iy] += alpha_r * temp_r - alpha_i * temp_i; + y[iy+1] += alpha_r * temp_i + alpha_i * temp_r; +#else + y[iy] += alpha_r * temp_r + alpha_i * temp_i; + y[iy+1] -= alpha_r * temp_i - alpha_i * temp_r; +#endif + iy += inc_y2; + a_ptr += lda2; + } + + } + + + return(0); +} diff --git a/kernel/riscv64/zgemv_t_vector.c b/kernel/riscv64/zgemv_t_vector.c index a7a8a5279..5d85ab3a4 100644 --- a/kernel/riscv64/zgemv_t_vector.c +++ b/kernel/riscv64/zgemv_t_vector.c @@ -27,31 +27,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m4_t +#define VSETVL(n) RISCV_RVV(vsetvl_e32m2)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() +#define FLOAT_V_T vfloat32m2_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse32_v_f32m4 -#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 -#define VFMACCVV_FLOAT vfmacc_vv_f32m4 -#define VFNMSACVV_FLOAT vfnmsac_vv_f32m4 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMULVV_FLOAT vfmul_vv_f32m4 +#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f32m1_f32) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m2) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(vr, va, vb, gvl) RISCV_RVV(vfredusum_vs_f32m2_f32m1)(vr, va, vb, gvl) #else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m4_t +#define VFREDSUM_FLOAT(vr, va, vb, gvl) RISCV_RVV(vfredusum_vs_f32m2_f32m1)(va, vb, gvl) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m2) +#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f32m2) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m2) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m2) +#else +#define VSETVL(n) RISCV_RVV(vsetvl_e64m2)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() +#define FLOAT_V_T vfloat64m2_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse64_v_f64m4 -#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 -#define VFMACCVV_FLOAT vfmacc_vv_f64m4 -#define VFNMSACVV_FLOAT vfnmsac_vv_f64m4 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMULVV_FLOAT vfmul_vv_f64m4 +#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f64m1_f64) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m2) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(vr, va, vb, gvl) RISCV_RVV(vfredusum_vs_f64m2_f64m1)(vr, va, vb, gvl) +#else +#define VFREDSUM_FLOAT(vr, va, vb, gvl) RISCV_RVV(vfredusum_vs_f64m2_f64m1)(va, vb, gvl) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m2) +#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f64m2) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m2) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m2) #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) @@ -62,49 +70,43 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT temp_r, temp_i; FLOAT_V_T va0, va1, vx0, vx1, vr, vi; - unsigned int gvl = 0; - FLOAT_V_T_M1 v_res, v_z0; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_z0 = VFMVVF_FLOAT_M1(0, gvl); - + unsigned int gvl = VSETVL(m); + FLOAT_V_T_M1 v_res_r, v_res_i; BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; BLASLONG stride_a = sizeof(FLOAT) * 2; - gvl = VSETVL(m); BLASLONG inc_xv = inc_x * gvl * 2; BLASLONG inc_av = gvl * 2; BLASLONG inc_y2 = inc_y * 2; BLASLONG lda2 = lda * 2; + for(i = 0; i < n; i++){ + v_res_r = VFMVVF_FLOAT_M1(0, 1); + v_res_i = VFMVVF_FLOAT_M1(0, 1); gvl = VSETVL(m); j = 0; ix = 0; - vr = VFMVVF_FLOAT(0, gvl); - vi = VFMVVF_FLOAT(0, gvl); for(k = 0; k < m/gvl; k++){ va0 = VLSEV_FLOAT(&a_ptr[j], stride_a, gvl); va1 = VLSEV_FLOAT(&a_ptr[j+1], stride_a, gvl); vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); #if ( 
!defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - vr = VFMACCVV_FLOAT(vr, va0, vx0, gvl); + vr = VFMULVV_FLOAT(va0, vx0, gvl); + vi = VFMULVV_FLOAT(va0, vx1, gvl); vr = VFNMSACVV_FLOAT(vr, va1, vx1, gvl); - vi = VFMACCVV_FLOAT(vi, va0, vx1, gvl); vi = VFMACCVV_FLOAT(vi, va1, vx0, gvl); #else - vr = VFMACCVV_FLOAT(vr, va0, vx0, gvl); + vr = VFMULVV_FLOAT(va0, vx0, gvl); + vi = VFMULVV_FLOAT(va0, vx1, gvl); vr = VFMACCVV_FLOAT(vr, va1, vx1, gvl); - vi = VFMACCVV_FLOAT(vi, va0, vx1, gvl); vi = VFNMSACVV_FLOAT(vi, va1, vx0, gvl); - #endif + v_res_r = VFREDSUM_FLOAT(v_res_r, vr, v_res_r, gvl); + v_res_i = VFREDSUM_FLOAT(v_res_i, vi, v_res_i, gvl); + j += inc_av; ix += inc_xv; } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp_r = VFMVFS_FLOAT(v_res); - v_res = VFREDSUM_FLOAT(v_res, vi, v_z0, gvl); - temp_i = VFMVFS_FLOAT(v_res); if(j/2 < m){ gvl = VSETVL(m-j/2); va0 = VLSEV_FLOAT(&a_ptr[j], stride_a, gvl); @@ -113,21 +115,23 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) vr = VFMULVV_FLOAT(va0, vx0, gvl); - vr = VFNMSACVV_FLOAT(vr, va1, vx1, gvl); vi = VFMULVV_FLOAT(va0, vx1, gvl); + vr = VFNMSACVV_FLOAT(vr, va1, vx1, gvl); vi = VFMACCVV_FLOAT(vi, va1, vx0, gvl); #else vr = VFMULVV_FLOAT(va0, vx0, gvl); - vr = VFMACCVV_FLOAT(vr, va1, vx1, gvl); vi = VFMULVV_FLOAT(va0, vx1, gvl); + vr = VFMACCVV_FLOAT(vr, va1, vx1, gvl); vi = VFNMSACVV_FLOAT(vi, va1, vx0, gvl); #endif - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp_r += VFMVFS_FLOAT(v_res); - v_res = VFREDSUM_FLOAT(v_res, vi, v_z0, gvl); - temp_i += VFMVFS_FLOAT(v_res); + v_res_r = VFREDSUM_FLOAT(v_res_r, vr, v_res_r, gvl); + v_res_i = VFREDSUM_FLOAT(v_res_i, vi, v_res_i, gvl); } + + temp_r = VFMVFS_FLOAT(v_res_r); + temp_i = VFMVFS_FLOAT(v_res_i); + #if !defined(XCONJ) y[iy] += alpha_r * temp_r - alpha_i * temp_i; y[iy+1] += alpha_r * temp_i + alpha_i * temp_r; diff --git a/kernel/riscv64/zhemm_ltcopy_rvv_v1.c b/kernel/riscv64/zhemm_ltcopy_rvv_v1.c new file mode 100644 index 000000000..97013895a --- /dev/null +++ b/kernel/riscv64/zhemm_ltcopy_rvv_v1.c @@ -0,0 +1,139 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define INT_V_T vint32m2_t +#define VID_V_INT __riscv_vid_v_i32m2 +#define VADD_VX_INT __riscv_vadd_vx_i32m2 +#define VFRSUB_VF_FLOAT __riscv_vfrsub_vf_f32m2 +#define VMSGT_VX_INT __riscv_vmsgt_vx_i32m2_b16 +#define VMSLT_VX_INT __riscv_vmslt_vx_i32m2_b16 +#define VMSEQ_VX_INT __riscv_vmseq_vx_i32m2_b16 +#define VBOOL_T vbool16_t +#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f32m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define INT_V_T vint64m2_t +#define VID_V_INT __riscv_vid_v_i64m2 +#define VADD_VX_INT __riscv_vadd_vx_i64m2 +#define VFRSUB_VF_FLOAT __riscv_vfrsub_vf_f64m2 +#define VMSGT_VX_INT __riscv_vmsgt_vx_i64m2_b32 +#define VMSLT_VX_INT __riscv_vmslt_vx_i64m2_b32 +#define VMSEQ_VX_INT __riscv_vmseq_vx_i64m2_b32 +#define VBOOL_T vbool32_t +#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f64m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#endif + + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b) +{ + //fprintf(stderr, "%s, %s, m=%ld n=%ld lda=%ld posX=%ld posY=%ld\n", __FUNCTION__, __FILE__, m, n, lda, posX, posY); + + BLASLONG i, js, offset; + + FLOAT *ao1, *ao2; + + BLASLONG stride_lda = sizeof(FLOAT) * lda * 2; + + FLOAT_V_T vb0, vb1, vb2, va10, va11, va20, va21, vzero; + FLOAT_VX2_T va1x2, va2x2, vbx2; + VBOOL_T vbool_gt0, vbool_lt0, vbool_eq0; + INT_V_T vindex_max, vindex; + + size_t vl = VSETVL_MAX; + vindex_max = VID_V_INT(vl); + vzero = VFMVVF_FLOAT(ZERO, vl); + + for (js = n; js > 0; js -= vl, posX += vl) { + vl = VSETVL(js); + offset = posX - posY; + + ao1 = a + posX * 2 + posY * lda * 2; + ao2 = a + posY * 2 + posX * lda * 2; + + for (i = m; i > 0; i--, offset--) { + va2x2 = VLSSEG2_FLOAT(ao2, stride_lda, vl); + va1x2 = VLSEG2_FLOAT(ao1, vl); + + va20 = VGET_VX2(va2x2, 0); + va21 = VGET_VX2(va2x2, 1); + va10 = VGET_VX2(va1x2, 0); + va11 
= VGET_VX2(va1x2, 1); + + vindex = VADD_VX_INT(vindex_max, offset, vl); + vbool_gt0 = VMSGT_VX_INT(vindex, 0, vl); + vbool_lt0 = VMSLT_VX_INT(vindex, 0, vl); + vbool_eq0 = VMSEQ_VX_INT(vindex, 0, vl); + + vb0 = VMERGE_VVM_FLOAT(va20, va10, vbool_gt0, vl); + vb1 = VMERGE_VVM_FLOAT(va21, va11, vbool_gt0, vl); + + vb2 = VFRSUB_VF_FLOAT(vb1, ZERO, vl); + + vb1 = VMERGE_VVM_FLOAT(vb1, vb2, vbool_lt0, vl); + vb1 = VMERGE_VVM_FLOAT(vb1, vzero, vbool_eq0, vl); + + vbx2 = VSET_VX2(vbx2, 0, vb0); + vbx2 = VSET_VX2(vbx2, 1, vb1); + VSSEG2_FLOAT(b, vbx2, vl); + + b += vl * 2; + ao1 += lda * 2; + ao2 += 2; + } + } + + return 0; +} + diff --git a/kernel/riscv64/zhemm_utcopy_rvv_v1.c b/kernel/riscv64/zhemm_utcopy_rvv_v1.c new file mode 100644 index 000000000..59029e9e5 --- /dev/null +++ b/kernel/riscv64/zhemm_utcopy_rvv_v1.c @@ -0,0 +1,135 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define INT_V_T vint32m2_t +#define VID_V_INT __riscv_vid_v_i32m2 +#define VADD_VX_INT __riscv_vadd_vx_i32m2 +#define VFRSUB_VF_FLOAT __riscv_vfrsub_vf_f32m2 +#define VMSGT_VX_INT __riscv_vmsgt_vx_i32m2_b16 +#define VMSLT_VX_INT __riscv_vmslt_vx_i32m2_b16 +#define VMSEQ_VX_INT __riscv_vmseq_vx_i32m2_b16 +#define VBOOL_T vbool16_t +#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f32m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define INT_V_T vint64m2_t +#define VID_V_INT __riscv_vid_v_i64m2 +#define VADD_VX_INT __riscv_vadd_vx_i64m2 +#define VFRSUB_VF_FLOAT __riscv_vfrsub_vf_f64m2 +#define VMSGT_VX_INT __riscv_vmsgt_vx_i64m2_b32 +#define VMSLT_VX_INT __riscv_vmslt_vx_i64m2_b32 +#define VMSEQ_VX_INT __riscv_vmseq_vx_i64m2_b32 +#define VBOOL_T vbool32_t +#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f64m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#endif + + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b) +{ + BLASLONG i, js, offset; + + FLOAT *ao1, *ao2; + //fprintf(stderr, "%s, %s, m=%ld n=%ld lda=%ld posX=%ld posY=%ld\n", __FUNCTION__, __FILE__, m, n, lda, posX, posY); + BLASLONG stride_lda = sizeof(FLOAT) * lda * 2; + + FLOAT_V_T vb0, vb1, vb2, va10, va11, va20, va21, vzero; + FLOAT_VX2_T va1x2, va2x2, vbx2; + VBOOL_T vbool_gt0, vbool_eq0; + INT_V_T vindex_max, vindex; + + size_t vl = VSETVL_MAX; + vindex_max = VID_V_INT(vl); + vzero = VFMVVF_FLOAT(ZERO, vl); + + for (js = n; js > 0; js -= vl, posX += vl) { + vl = VSETVL(js); + offset = posX - posY; + + ao1 = a + posY * 2 + posX * lda * 2; + ao2 = a + posX * 2 + posY * lda * 2; + + for (i = m; i > 0; i--, offset--) { + va1x2 = VLSSEG2_FLOAT(ao1, stride_lda, vl); + va2x2 = VLSEG2_FLOAT(ao2, vl); + + va20 = VGET_VX2(va2x2, 0); + va21 = VGET_VX2(va2x2, 1); + va10 = VGET_VX2(va1x2, 0); + va11 = VGET_VX2(va1x2, 1); + + vindex = VADD_VX_INT(vindex_max, offset, vl); + vbool_gt0 = VMSGT_VX_INT(vindex, 0, vl); + vbool_eq0 = VMSEQ_VX_INT(vindex, 0, vl); + + vb0 = VMERGE_VVM_FLOAT(va20, va10, vbool_gt0, vl); + vb1 = VMERGE_VVM_FLOAT(va21, va11, vbool_gt0, vl); + + vb2 = VFRSUB_VF_FLOAT(vb1, ZERO, vl); + + vb1 = VMERGE_VVM_FLOAT(vb1, vb2, vbool_gt0, vl); + vb1 = VMERGE_VVM_FLOAT(vb1, vzero, vbool_eq0, vl); + + vbx2 = VSET_VX2(vbx2, 0, vb0); + vbx2 = VSET_VX2(vbx2, 1, vb1); + VSSEG2_FLOAT(b, vbx2, vl); + + b += vl * 2; + ao1 
+= 2; + ao2 += lda * 2; + } + } + + return 0; +} diff --git a/kernel/riscv64/zhemv_LM_rvv.c b/kernel/riscv64/zhemv_LM_rvv.c new file mode 100644 index 000000000..95c6a377c --- /dev/null +++ b/kernel/riscv64/zhemv_LM_rvv.c @@ -0,0 +1,198 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VFMVFS_FLOAT __riscv_vfmv_f_s_f32m1_f32 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4 +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f32m4_tu +#else +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VFMVFS_FLOAT __riscv_vfmv_f_s_f64m1_f64 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4 +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f64m4_tu +#endif + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){ + BLASLONG i, j, k; + BLASLONG ix, iy, ia; + BLASLONG jx, jy, ja; + FLOAT temp_r1, temp_i1; + FLOAT temp_r2, temp_i2; + FLOAT *a_ptr = a; + unsigned int gvl = 0; + FLOAT_V_T_M1 v_res, v_z0; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); + + FLOAT_V_T va0, va1, vx0, vx1, vy0, vy1, vr0, vr1; + BLASLONG stride_x, stride_y, stride_a, inc_xv, inc_yv, inc_av, len, lda2; + + BLASLONG inc_x2 = incx * 2; + BLASLONG inc_y2 = incy * 2; + stride_x = inc_x2 * sizeof(FLOAT); + stride_y = inc_y2 * sizeof(FLOAT); + stride_a = 2 * sizeof(FLOAT); + lda2 = lda * 2; + + jx = 0; + jy = 0; + ja = 0; + for(j = 0; j < offset; j++){ + temp_r1 = alpha_r * x[jx] - alpha_i * x[jx+1];; + temp_i1 = alpha_r * x[jx+1] + alpha_i * x[jx]; + temp_r2 = 0; + temp_i2 = 0; + y[jy] += temp_r1 * a_ptr[ja]; + y[jy+1] += temp_i1 * a_ptr[ja]; + ix = jx + inc_x2; + iy = jy + inc_y2; + ia = ja + 2; + i = j + 1; + len = m - i; + if(len > 0){ + gvl = VSETVL(len); + inc_xv = incx * gvl * 2; + inc_yv = incy * gvl * 2; + inc_av = gvl * 2; + vr0 = VFMVVF_FLOAT(0, gvl); + vr1 = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < len / gvl; k++){ + va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl); + va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl); + vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); + vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl); +#ifndef HEMVREV + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = 
VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#endif + VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl); + VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl); + + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); +#ifndef HEMVREV + vr0 = VFMACCVV_FLOAT(vr0, vx0, va0, gvl); + vr0 = VFMACCVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx1, va0, gvl); + vr1 = VFNMSACVV_FLOAT(vr1, vx0, va1, gvl); +#else + vr0 = VFMACCVV_FLOAT(vr0, vx0, va0, gvl); + vr0 = VFNMSACVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx1, va0, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); + +#endif + i += gvl; + ix += inc_xv; + iy += inc_yv; + ia += inc_av; + } + + if(i < m){ + unsigned int gvl_rem = VSETVL(m-i); + va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl_rem); + va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl_rem); + vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl_rem); + vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl_rem); +#ifndef HEMVREV + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl_rem); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl_rem); + vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl_rem); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl_rem); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl_rem); + vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl_rem); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl_rem); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl_rem); +#endif + VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl_rem); + VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl_rem); + + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl_rem); + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl_rem); +#ifndef HEMVREV + vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, va0, gvl_rem); + vr0 = VFMACCVV_FLOAT_TU(vr0, vx1, va1, gvl_rem); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, va0, gvl_rem); + vr1 = VFNMSACVV_FLOAT_TU(vr1, vx0, va1, gvl_rem); +#else + vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, va0, gvl_rem); + vr0 = VFNMSACVV_FLOAT_TU(vr0, vx1, va1, gvl_rem); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, va0, gvl_rem); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, va1, gvl_rem); +#endif + } + v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl); + temp_r2 = VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl); + temp_i2 = VFMVFS_FLOAT(v_res); + } + y[jy] += alpha_r * temp_r2 - alpha_i * temp_i2; + y[jy+1] += alpha_r * temp_i2 + alpha_i * temp_r2; + jx += inc_x2; + jy += inc_y2; + ja += 2; + a_ptr += lda2; + } + return(0); +} diff --git a/kernel/riscv64/zhemv_LM_vector.c b/kernel/riscv64/zhemv_LM_vector.c index 0a284a999..117db7d84 100644 --- a/kernel/riscv64/zhemv_LM_vector.c +++ b/kernel/riscv64/zhemv_LM_vector.c @@ -27,37 +27,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse32_v_f32m4 -#define VSSEV_FLOAT vsse32_v_f32m4 -#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 -#define VFMACCVV_FLOAT vfmacc_vv_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMULVV_FLOAT vfmul_vv_f32m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 -#define VFNMSACVV_FLOAT vfnmsac_vv_f32m4 +#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f32m1_f32) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) RISCV_RVV(vfredusum_vs_f32m4_f32m1)(v_res, va, vb, gvl) #else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m1() +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m4_f32m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4) +#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f32m4) +#else +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse64_v_f64m4 -#define VSSEV_FLOAT vsse64_v_f64m4 -#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 -#define VFMACCVV_FLOAT vfmacc_vv_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMULVV_FLOAT vfmul_vv_f64m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 -#define VFNMSACVV_FLOAT vfnmsac_vv_f64m4 +#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f64m1_f64) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) RISCV_RVV(vfredusum_vs_f64m4_f64m1)(v_res, va, vb, gvl) +#else +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m4_f64m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4) +#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f64m4) #endif int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){ @@ -143,9 +151,9 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B iy += inc_yv; ia += inc_av; } - v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl); + v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl); temp_r2 = VFMVFS_FLOAT(v_res); - v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl); + v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl); temp_i2 = VFMVFS_FLOAT(v_res); if(i < m){ gvl = VSETVL(m-i); @@ -181,9 +189,9 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); #endif - v_res = VFREDSUM_FLOAT(v_res, vr0, 
v_z0, gvl); + v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl); temp_r2 += VFMVFS_FLOAT(v_res); - v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl); + v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl); temp_i2 += VFMVFS_FLOAT(v_res); } } diff --git a/kernel/riscv64/zhemv_UV_rvv.c b/kernel/riscv64/zhemv_UV_rvv.c new file mode 100644 index 000000000..ec06622fc --- /dev/null +++ b/kernel/riscv64/zhemv_UV_rvv.c @@ -0,0 +1,199 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VFMVFS_FLOAT __riscv_vfmv_f_s_f32m1_f32 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4 +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f32m4_tu +#else +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VFMVFS_FLOAT __riscv_vfmv_f_s_f64m1_f64 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4 +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f64m4_tu +#endif + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){ + BLASLONG i, j, k; + BLASLONG ix, iy, ia; + BLASLONG jx, jy, ja; + FLOAT temp_r1, temp_i1; + FLOAT temp_r2, temp_i2; + FLOAT *a_ptr = a; + unsigned int gvl = 0; + FLOAT_V_T_M1 v_res, v_z0; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); + + FLOAT_V_T va0, va1, vx0, vx1, vy0, vy1, vr0, vr1; + BLASLONG stride_x, stride_y, stride_a, inc_xv, inc_yv, inc_av, lda2; + + BLASLONG inc_x2 = incx * 2; + BLASLONG inc_y2 = incy * 2; + stride_x = inc_x2 * sizeof(FLOAT); + stride_y = inc_y2 * sizeof(FLOAT); + stride_a = 2 * sizeof(FLOAT); + lda2 = lda * 2; + + BLASLONG m1 = m - offset; + a_ptr = a + m1 * lda2; + jx = m1 * inc_x2; + jy = m1 * inc_y2; + ja = m1 * 2; + for(j = m1; j < m; j++){ + temp_r1 = alpha_r * x[jx] - alpha_i * x[jx+1];; + temp_i1 = alpha_r * x[jx+1] + alpha_i * x[jx]; + temp_r2 = 0; + temp_i2 = 0; + ix = 0; + iy = 0; + ia = 0; + i = 0; + if(j > 0){ + gvl = VSETVL(j); + inc_xv = incx * gvl * 2; + inc_yv = incy * gvl * 2; + inc_av = gvl * 2; + vr0 = VFMVVF_FLOAT(0, gvl); + vr1 = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < j / gvl; k++){ + va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl); + va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl); + vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); + vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl); +#ifndef HEMVREV + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = 
VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#endif + VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl); + VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl); + + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); +#ifndef HEMVREV + vr0 = VFMACCVV_FLOAT(vr0, vx0, va0, gvl); + vr0 = VFMACCVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx1, va0, gvl); + vr1 = VFNMSACVV_FLOAT(vr1, vx0, va1, gvl); +#else + vr0 = VFMACCVV_FLOAT(vr0, vx0, va0, gvl); + vr0 = VFNMSACVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx1, va0, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); + +#endif + i += gvl; + ix += inc_xv; + iy += inc_yv; + ia += inc_av; + } + + if(i < j){ + unsigned int gvl_rem = VSETVL(j-i); + va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl_rem); + va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl_rem); + vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl_rem); + vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl_rem); +#ifndef HEMVREV + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl_rem); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl_rem); + vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl_rem); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl_rem); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl_rem); + vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl_rem); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl_rem); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl_rem); +#endif + VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl_rem); + VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl_rem); + + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl_rem); + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl_rem); +#ifndef HEMVREV + vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, va0, gvl_rem); + vr0 = VFMACCVV_FLOAT_TU(vr0, vx1, va1, gvl_rem); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, va0, gvl_rem); + vr1 = VFNMSACVV_FLOAT_TU(vr1, vx0, va1, gvl_rem); +#else + vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, va0, gvl_rem); + vr0 = VFNMSACVV_FLOAT_TU(vr0, vx1, va1, gvl_rem); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, va0, gvl_rem); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, va1, gvl_rem); +#endif + } + v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl); + temp_r2 = VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl); + temp_i2 = VFMVFS_FLOAT(v_res); + } + y[jy] += temp_r1 * a_ptr[ja]; + y[jy+1] += temp_i1 * a_ptr[ja]; + y[jy] += alpha_r * temp_r2 - alpha_i * temp_i2; + y[jy+1] += alpha_r * temp_i2 + alpha_i * temp_r2; + jx += inc_x2; + jy += inc_y2; + ja += 2; + a_ptr += lda2; + } + return(0); +} diff --git a/kernel/riscv64/zhemv_UV_vector.c b/kernel/riscv64/zhemv_UV_vector.c index 33b7c9c25..7c6b63bf3 100644 --- a/kernel/riscv64/zhemv_UV_vector.c +++ b/kernel/riscv64/zhemv_UV_vector.c @@ -27,37 +27,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse32_v_f32m4 -#define VSSEV_FLOAT vsse32_v_f32m4 -#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 -#define VFMACCVV_FLOAT vfmacc_vv_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMULVV_FLOAT vfmul_vv_f32m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 -#define VFNMSACVV_FLOAT vfnmsac_vv_f32m4 +#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f32m1_f32) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) RISCV_RVV(vfredusum_vs_f32m4_f32m1)(v_res, va, vb, gvl) #else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m1() +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m4_f32m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4) +#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f32m4) +#else +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse64_v_f64m4 -#define VSSEV_FLOAT vsse64_v_f64m4 -#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 -#define VFMACCVV_FLOAT vfmacc_vv_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMULVV_FLOAT vfmul_vv_f64m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 -#define VFNMSACVV_FLOAT vfnmsac_vv_f64m4 +#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f64m1_f64) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) RISCV_RVV(vfredusum_vs_f64m4_f64m1)(v_res, va, vb, gvl) +#else +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m4_f64m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4) +#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f64m4) #endif int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){ @@ -142,9 +150,9 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B iy += inc_yv; ia += inc_av; } - v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl); + v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl); temp_r2 = VFMVFS_FLOAT(v_res); - v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl); + v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl); temp_i2 = VFMVFS_FLOAT(v_res); if(i < j){ gvl = VSETVL(j-i); @@ -180,9 +188,9 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); #endif - v_res = VFREDSUM_FLOAT(v_res, vr0, 
v_z0, gvl); + v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl); temp_r2 += VFMVFS_FLOAT(v_res); - v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl); + v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl); temp_i2 += VFMVFS_FLOAT(v_res); } } diff --git a/kernel/riscv64/znrm2_rvv.c b/kernel/riscv64/znrm2_rvv.c new file mode 100644 index 000000000..32f67758a --- /dev/null +++ b/kernel/riscv64/znrm2_rvv.c @@ -0,0 +1,275 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m4() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define MASK_T vbool8_t +#define VLEV_FLOAT __riscv_vle32_v_f32m4 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1_tu +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VMFIRSTM __riscv_vfirst_m_b8 +#define VFREDMAXVS_FLOAT_TU __riscv_vfredmax_vs_f32m4_f32m1_tu +#define VFMVFS_FLOAT __riscv_vfmv_f_s_f32m1_f32 +#define VMFGTVF_FLOAT __riscv_vmfgt_vf_f32m4_b8 +#define VFDIVVF_FLOAT __riscv_vfdiv_vf_f32m4 +#define VFABSV_FLOAT __riscv_vfabs_v_f32m4 +#else +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m4() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define MASK_T vbool16_t +#define VLEV_FLOAT __riscv_vle64_v_f64m4 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1_tu +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VMFIRSTM __riscv_vfirst_m_b16 +#define VFREDMAXVS_FLOAT_TU __riscv_vfredmax_vs_f64m4_f64m1_tu +#define VFMVFS_FLOAT __riscv_vfmv_f_s_f64m1_f64 +#define VMFGTVF_FLOAT __riscv_vmfgt_vf_f64m4_b16 +#define VFDIVVF_FLOAT __riscv_vfdiv_vf_f64m4 +#define VFABSV_FLOAT __riscv_vfabs_v_f64m4 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + + if (n <= 0 || inc_x <= 0) return(0.0); + + FLOAT_V_T vr, v0, v_zero; + unsigned int gvl = 0; + FLOAT_V_T_M1 v_res, v_z0; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); + + FLOAT scale = 0.0, ssq = 0.0; + MASK_T mask; + BLASLONG index = 0; + if (inc_x == 1) { + BLASLONG n2 = n * 2; + gvl = VSETVL(n2); + vr = VFMVVF_FLOAT(0, gvl); + v_zero = VFMVVF_FLOAT(0, gvl); + for (i=0,j=0; i 0 ){ // scale change? 
+ // find largest element in v0 and v1 + v_res = VFREDMAX( v0, v_z0, gvl ); + v_res = VFREDMAX( v1, v_res, gvl ); + FLOAT const largest_elt = EXTRACT_FLOAT( v_res ); + + v_scale = VFDIV( v_scale, largest_elt, gvl ); // scale/largest_elt + v_scale = VFMUL( v_scale, v_scale, gvl ); // (scale/largest_elt)*(scale/largest_elt) + v_ssq = VFMUL( v_scale, v_ssq, gvl ); // ssq*(scale/largest_elt)*(scale/largest_elt) + + v_scale = VFMVVF_FLOAT( largest_elt, gvl ); // splated largest_elt becomes new scale } - //ssq in vector vr: vr[0] - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - //total ssq now - ssq += VFMVFS_FLOAT(v_res); - //tail - if(j < n){ - gvl = VSETVL(n-j); - v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl); - //fabs(vector) - mask = VMFLTVF_FLOAT(v0, 0, gvl); - v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl); - //if scale change - mask = VMFGTVF_FLOAT(v0, scale, gvl); - index = VMFIRSTM(mask, gvl); - if(index == -1){//no elements greater than scale - if(scale != 0.0){ - v0 = VFDIVVF_FLOAT(v0, scale, gvl); - vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl); + MASK_T nonzero_mask0 = VMFNE( v0, 0, gvl ); + MASK_T nonzero_mask1 = VMFNE( v1, 0, gvl ); + v0 = VFDIV_M( nonzero_mask0, v_zero, v0, v_scale, gvl ); + v1 = VFDIV_M( nonzero_mask1, v_zero, v1, v_scale, gvl ); + v_ssq = VFMACC_M( nonzero_mask0, v_ssq, v0, v0, gvl ); + v_ssq = VFMACC_M( nonzero_mask1, v_ssq, v1, v1, gvl ); + + idx += inc_x * gvl * 2; + } + + v_res = VFREDUSUM(v_ssq, v_z0, gvl); + FLOAT ssq = EXTRACT_FLOAT(v_res); + FLOAT scale = EXTRACT_FLOAT0_V(v_scale); + + //finish any tail using scalar ops + i*=gvl; + if(i 0; n -= vl, x += vl*2, y += vl*2) { + vl = VSETVL(n); + + vxx2 = VLSEG_FLOAT(x, vl); + vyx2 = VLSEG_FLOAT(y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); + + vt0 = VFMULVF_FLOAT(vx0, c, vl); + vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl); + vt1 = VFMULVF_FLOAT(vx1, c, vl); + vt1 = VFMACCVF_FLOAT(vt1, s, vy1, vl); + vy0 = VFMULVF_FLOAT(vy0, c, vl); + vy0 = VFNMSACVF_FLOAT(vy0, s, vx0, vl); + vy1 = VFMULVF_FLOAT(vy1, c, vl); + vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl); + + vtx2 = VSET_VX2(vtx2, 0, vt0); + vtx2 = VSET_VX2(vtx2, 1, vt1); + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + + VSSEG_FLOAT(x, vtx2, vl); + VSSEG_FLOAT(y, vyx2, vl); + } + + } else if (inc_x == 1){ + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) { + vl = VSETVL(n); + + vxx2 = VLSEG_FLOAT(x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); + + vt0 = VFMULVF_FLOAT(vx0, c, vl); + vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl); + vt1 = VFMULVF_FLOAT(vx1, c, vl); + vt1 = VFMACCVF_FLOAT(vt1, s, vy1, vl); + vy0 = VFMULVF_FLOAT(vy0, c, vl); + vy0 = VFNMSACVF_FLOAT(vy0, s, vx0, vl); + vy1 = VFMULVF_FLOAT(vy1, c, vl); + vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl); + + vtx2 = VSET_VX2(vtx2, 0, vt0); + vtx2 = VSET_VX2(vtx2, 1, vt1); + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + + VSSEG_FLOAT(x, vtx2, vl); + VSSSEG_FLOAT(y, stride_y, vyx2, vl); + } + + } else if (inc_y == 1){ + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) { + vl = VSETVL(n); + + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSEG_FLOAT(y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); + + vt0 = VFMULVF_FLOAT(vx0, c, 
vl); + vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl); + vt1 = VFMULVF_FLOAT(vx1, c, vl); + vt1 = VFMACCVF_FLOAT(vt1, s, vy1, vl); + vy0 = VFMULVF_FLOAT(vy0, c, vl); + vy0 = VFNMSACVF_FLOAT(vy0, s, vx0, vl); + vy1 = VFMULVF_FLOAT(vy1, c, vl); + vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl); + + vtx2 = VSET_VX2(vtx2, 0, vt0); + vtx2 = VSET_VX2(vtx2, 1, vt1); + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + + VSSSEG_FLOAT(x, stride_x, vtx2, vl); + VSSEG_FLOAT(y, vyx2, vl); + } + + } else { + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) { + vl = VSETVL(n); + + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); + + vt0 = VFMULVF_FLOAT(vx0, c, vl); + vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl); + vt1 = VFMULVF_FLOAT(vx1, c, vl); + vt1 = VFMACCVF_FLOAT(vt1, s, vy1, vl); + vy0 = VFMULVF_FLOAT(vy0, c, vl); + vy0 = VFNMSACVF_FLOAT(vy0, s, vx0, vl); + vy1 = VFMULVF_FLOAT(vy1, c, vl); + vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl); + + vtx2 = VSET_VX2(vtx2, 0, vt0); + vtx2 = VSET_VX2(vtx2, 1, vt1); + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + + VSSSEG_FLOAT(x, stride_x, vtx2, vl); + VSSSEG_FLOAT(y, stride_y, vyx2, vl); + } + } + + return 0; +} diff --git a/kernel/riscv64/zrot_vector.c b/kernel/riscv64/zrot_vector.c index 727d13a87..50751b343 100644 --- a/kernel/riscv64/zrot_vector.c +++ b/kernel/riscv64/zrot_vector.c @@ -27,27 +27,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() #define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT vle32_v_f32m4 -#define VLSEV_FLOAT vlse32_v_f32m4 -#define VSEV_FLOAT vse32_v_f32m4 -#define VSSEV_FLOAT vsse32_v_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 -#define VFMULVF_FLOAT vfmul_vf_f32m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) +#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f32m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4) #else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() #define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT vle64_v_f64m4 -#define VLSEV_FLOAT vlse64_v_f64m4 -#define VSEV_FLOAT vse64_v_f64m4 -#define VSSEV_FLOAT vsse64_v_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 -#define VFMULVF_FLOAT vfmul_vf_f64m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) +#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f64m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4) #endif int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) @@ -59,7 +59,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT unsigned int gvl = 0; 
FLOAT_V_T vt0, vt1, vx0, vx1, vy0, vy1; - gvl = VSETVL(n); + gvl = VSETVL((inc_x != 0 && inc_y != 0) ? n : 1); BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); BLASLONG inc_xv = inc_x * 2 * gvl; @@ -112,7 +112,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT } }else{ - if (inc_x == 0 && inc_y == 0) gvl = VSETVL(1); for(i=0,j=0; i < n/gvl; i++){ vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); diff --git a/kernel/riscv64/zscal.c b/kernel/riscv64/zscal.c index 0521aaa0b..b2d537d04 100644 --- a/kernel/riscv64/zscal.c +++ b/kernel/riscv64/zscal.c @@ -60,6 +60,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F else { temp = - da_i * x[ip+1] ; + if (isnan(x[ip]) || isinf(x[ip])) temp = NAN; x[ip+1] = da_i * x[ip] ; } } diff --git a/kernel/riscv64/zscal_rvv.c b/kernel/riscv64/zscal_rvv.c new file mode 100644 index 000000000..ae79d9f9d --- /dev/null +++ b/kernel/riscv64/zscal_rvv.c @@ -0,0 +1,112 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m4() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VSET_VX2 __riscv_vset_v_f32m4_f32m4x2 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4x2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#else +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m4() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VSET_VX2 __riscv_vset_v_f64m4_f64m4x2 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4x2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + + if((n <= 0) || (inc_x <= 0)) return(0); + + FLOAT_V_T vt, vr, vi; + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + size_t vlmax = VSETVL_MAX; + FLOAT_VX2_T vx2; + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2) { + vl = VSETVL(n); + + vx2 = VLSEG_FLOAT(x, vl); + vr = VGET_VX2(vx2, 0); + vi = VGET_VX2(vx2, 1); + + vt = VFMULVF_FLOAT(vr, da_r, vl); + vt = VFNMSACVF_FLOAT(vt, da_i, vi, vl); + vi = VFMULVF_FLOAT(vi, da_r, vl); + vi = VFMACCVF_FLOAT(vi, da_i, vr, vl); + + vx2 = VSET_VX2(vx2, 0, vt); + vx2 = VSET_VX2(vx2, 1, vi); + VSSEG_FLOAT(x, vx2, vl); + } + + } else { + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { + vl = VSETVL(n); + + vx2 = VLSSEG_FLOAT(x, stride_x, vl); + vr = VGET_VX2(vx2, 0); + vi = VGET_VX2(vx2, 1); + + vt = VFMULVF_FLOAT(vr, da_r, vl); + vt = VFNMSACVF_FLOAT(vt, da_i, vi, vl); + vi = VFMULVF_FLOAT(vi, da_r, vl); + vi = VFMACCVF_FLOAT(vi, da_i, vr, vl); + + vx2 = VSET_VX2(vx2, 0, vt); + vx2 = VSET_VX2(vx2, 1, vi); + VSSSEG_FLOAT(x, stride_x, vx2, vl); + } + } + + return(0); +} diff --git a/kernel/riscv64/zscal_vector.c b/kernel/riscv64/zscal_vector.c index d275b75f8..536bbdf73 100644 --- a/kernel/riscv64/zscal_vector.c +++ b/kernel/riscv64/zscal_vector.c @@ -27,25 +27,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() #define FLOAT_V_T vfloat32m4_t -#define VLSEV_FLOAT vlse32_v_f32m4 -#define VSSEV_FLOAT vsse32_v_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 -#define VFMULVF_FLOAT vfmul_vf_f32m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) +#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f32m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4) #else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() #define FLOAT_V_T vfloat64m4_t -#define VLSEV_FLOAT vlse64_v_f64m4 -#define VSSEV_FLOAT vsse64_v_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 -#define VFMULVF_FLOAT vfmul_vf_f64m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) +#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f64m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4) #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) @@ -59,84 +59,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F unsigned int gvl = 0; FLOAT_V_T vt, v0, v1; - if(da_r == 0.0 && da_i == 0.0){ - gvl = VSETVL(n); - BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); - BLASLONG inc_xv = inc_x * 2 * gvl; - vt = VFMVVF_FLOAT(0.0, gvl); - for(i=0,j=0; i < n/(gvl*2); i++){ - VSSEV_FLOAT(&x[ix], stride_x, vt, gvl); - VSSEV_FLOAT(&x[ix+1], stride_x, vt, gvl); - VSSEV_FLOAT(&x[ix+inc_xv], stride_x, vt, gvl); - VSSEV_FLOAT(&x[ix+inc_xv+1], stride_x, vt, gvl); - - j += gvl*2; - ix += inc_xv*2; - } - for(; j < n; ){ - gvl = VSETVL(n-j); - VSSEV_FLOAT(&x[ix], stride_x, vt, gvl); - VSSEV_FLOAT(&x[ix+1], stride_x, vt, gvl); - j += gvl; - ix += inc_x * 2 * gvl; - } - }else if(da_r == 0.0){ - gvl = VSETVL(n); - BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); - BLASLONG inc_xv = inc_x * 2 * gvl; - for(i=0,j=0; i < n/gvl; i++){ - v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); - v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); - - vt = VFMULVF_FLOAT(v1, -da_i, gvl); - v1 = VFMULVF_FLOAT(v0, da_i, gvl); - - VSSEV_FLOAT(&x[ix], stride_x, vt, gvl); - VSSEV_FLOAT(&x[ix+1], stride_x, v1, gvl); - - j += gvl; - ix += inc_xv; - } - if(j < n){ - gvl = VSETVL(n-j); - v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); - v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); - - vt = VFMULVF_FLOAT(v1, -da_i, gvl); - v1 = VFMULVF_FLOAT(v0, da_i, gvl); - - VSSEV_FLOAT(&x[ix], stride_x, vt, gvl); - VSSEV_FLOAT(&x[ix+1], stride_x, v1, gvl); - } - }else if(da_i == 0.0){ - gvl = VSETVL(n); - BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); - BLASLONG inc_xv = inc_x * 2 * gvl; - for(i=0,j=0; i < n/gvl; i++){ - v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); - v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); - - vt = VFMULVF_FLOAT(v0, da_r, gvl); - v1 = VFMULVF_FLOAT(v1, da_r, gvl); - - VSSEV_FLOAT(&x[ix], stride_x, vt, gvl); - VSSEV_FLOAT(&x[ix+1], stride_x, v1, 
gvl); - - j += gvl; - ix += inc_xv; - } - if(j < n){ - gvl = VSETVL(n-j); - v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); - v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); - - vt = VFMULVF_FLOAT(v0, da_r, gvl); - v1 = VFMULVF_FLOAT(v1, da_r, gvl); - - VSSEV_FLOAT(&x[ix], stride_x, vt, gvl); - VSSEV_FLOAT(&x[ix+1], stride_x, v1, gvl); - } - }else{ + { gvl = VSETVL(n); BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); BLASLONG inc_xv = inc_x * 2 * gvl; diff --git a/kernel/riscv64/zsum_rvv.c b/kernel/riscv64/zsum_rvv.c new file mode 100644 index 000000000..489188bd5 --- /dev/null +++ b/kernel/riscv64/zsum_rvv.c @@ -0,0 +1,107 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m4() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 +#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#define VFADDVV_FLOAT_TU __riscv_vfadd_vv_f32m4_tu +#else +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m4() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 +#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#define VFADDVV_FLOAT_TU __riscv_vfadd_vv_f64m4_tu +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT sumf = 0.0; + if (n <= 0 || inc_x <= 0) return(sumf); + + FLOAT_V_T v0, v1; + FLOAT_VX2_T vx2; + size_t vlmax = VSETVL_MAX; + FLOAT_V_T v_sum = VFMVVF_FLOAT(0, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2) { + vl = VSETVL(n); + + vx2 = VLSEG_FLOAT(x, vl); + + v0 = VGET_VX2(vx2, 0); + v1 = VGET_VX2(vx2, 1); + + v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v0, vl); + v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v1, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { + vl = VSETVL(n); + + vx2 = VLSSEG_FLOAT(x, stride_x, vl); + + v0 = VGET_VX2(vx2, 0); + v1 = VGET_VX2(vx2, 1); + + v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v0, vl); + v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v1, vl); + } + + } + + FLOAT_V_T_M1 v_res = VFMVVF_FLOAT_M1(0, vlmax); + v_res = VFREDSUMVS_FLOAT(v_sum, v_res, vlmax); + sumf += VFMVFS_FLOAT_M1(v_res); + + return(sumf); +} diff --git a/kernel/riscv64/zsum_vector.c b/kernel/riscv64/zsum_vector.c new file mode 100644 index 000000000..ca0b02b5c --- /dev/null +++ b/kernel/riscv64/zsum_vector.c @@ -0,0 +1,131 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN _b32 +# else +# define ELEN 32 +# define MLEN _b16 +# endif +#else +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN _b8 +# else +# define ELEN 32 +# define MLEN _b4 +# endif +#endif + +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#define VFREDSUMVS_FLOAT JOIN(RISCV_RVV(vfredusum_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) +#define VFADDVV_FLOAT JOIN(RISCV_RVV(vfadd), _vv_f, ELEN, LMUL, _) +#define VMFLTVF_FLOAT JOIN(RISCV_RVV(vmflt), _vf_f, ELEN, LMUL, MLEN) + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + BLASLONG ix=0; + FLOAT asumf=0.0; + if (n <= 0 || inc_x <= 0) return(asumf); + unsigned int gvl = 0; + FLOAT_V_T v0, v1, v_zero,v_sum; + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(0, 1); + + if(inc_x == 1){ + BLASLONG n2 = n * 2; + gvl = VSETVL(n2); + v_zero = VFMVVF_FLOAT(0, gvl); + if(gvl <= n2/2){ + v_sum = VFMVVF_FLOAT(0, gvl); + for(i=0,j=0; i 0; m -= vl * 2, ptr -= vl*inc_y * 2) { + vl = VSETVL(m); + vyx2 = VLSSEG_FLOAT(ptr - 2, stride_y, vl); + VSSSEG_FLOAT(ptr, stride_y, vyx2, vl); + } + y[0] = temp[0]; + y[1] = temp[1]; + } + else if(inc_y == 0) { + FLOAT temp[2]; + temp[0] = y[0]; + temp[1] = y[1]; + y[0] = x[(n - 1) * inc_x * 2]; + y[0] = x[(n - 1) * inc_x * 2 + 1]; + FLOAT* ptr = x + (n - 1) * inc_x * 2; // start from the last one + BLASLONG stride_x = (0 - inc_x) * sizeof(FLOAT) * 2; // reverse + BLASLONG m = n - 1; + for (size_t vl; m > 0; m -= vl * 2, ptr -= vl*inc_x * 2) { + vl = VSETVL(m); + vxx2 = VLSSEG_FLOAT(ptr - 2, stride_x, vl); + VSSSEG_FLOAT(ptr, stride_x, vxx2, vl); + } + x[0] = temp[0]; + x[1] = temp[1]; + } + else if(inc_x == 1 && inc_y == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*2) { + vl = VSETVL(n); + + vxx2 = VLSEG_FLOAT(x, vl); + vyx2 = VLSEG_FLOAT(y, vl); + + VSSEG_FLOAT(y, vxx2, vl); + VSSEG_FLOAT(x, vyx2, vl); + } + + } else if (inc_x == 1){ + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) { + vl = VSETVL(n); + + vxx2 = 
VLSEG_FLOAT(x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + + VSSSEG_FLOAT(y, stride_y, vxx2, vl); + VSSEG_FLOAT(x, vyx2, vl); + } + + } else if (inc_y == 1){ + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) { + vl = VSETVL(n); + + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSEG_FLOAT(y, vl); + + VSSEG_FLOAT(y, vxx2, vl); + VSSSEG_FLOAT(x, stride_x, vyx2, vl); + } + + } else { + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) { + vl = VSETVL(n); + + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + + VSSSEG_FLOAT(y, stride_y, vxx2, vl); + VSSSEG_FLOAT(x, stride_x, vyx2, vl); + } + + } + + return(0); +} diff --git a/kernel/riscv64/zswap_vector.c b/kernel/riscv64/zswap_vector.c index 09cc8992a..f2734c4a9 100644 --- a/kernel/riscv64/zswap_vector.c +++ b/kernel/riscv64/zswap_vector.c @@ -27,35 +27,50 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VSEV_FLOAT vse32_v_f32m8 -#define VSSEV_FLOAT vsse32_v_f32m8 + +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 64 +# else +# define ELEN 32 +# define MLEN 32 +# endif #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VSEV_FLOAT vse64_v_f64m8 -#define VSSEV_FLOAT vsse64_v_f64m8 +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 16 +# else +# define ELEN 32 +# define MLEN 8 +# endif #endif +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#define VSEV_FLOAT JOIN(RISCV_RVV(vse), ELEN, _v_f, ELEN, LMUL) +#define VSSEV_FLOAT JOIN(RISCV_RVV(vsse), ELEN, _v_f, ELEN, LMUL) + int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i = 0, j = 0; BLASLONG ix = 0,iy = 0; BLASLONG stride_x, stride_y; FLOAT_V_T vx0, vx1, vy0, vy1; - unsigned int gvl = 0; + unsigned int gvl = VSETVL((inc_x != 0 && inc_y != 0) ? 
n : 1); + if( inc_x == 0 && inc_y == 0 ) { n = n & 1; } - if (n < 0) return(0); + if (n <= 0) return(0); if(inc_x == 1 && inc_y == 1){ - gvl = VSETVL(n); BLASLONG n2 = n * 2; if(gvl <= n2/2){ for(i=0,j=0; i 0; js -= vl, posX += vl) { + vl = VSETVL(js); + offset = posX - posY; + + ao1 = a + posX * 2 + posY * lda * 2; + ao2 = a + posY * 2 + (posX) * lda * 2; + + for (i = m; i > 0; i--, offset--) { + + va2x2 = VLSSEG2_FLOAT(ao2, stride_lda, vl); + va1x2 = VLSEG2_FLOAT(ao1, vl); + + va20 = VGET_VX2(va2x2, 0); + va21 = VGET_VX2(va2x2, 1); + va10 = VGET_VX2(va1x2, 0); + va11 = VGET_VX2(va1x2, 1); + + vindex = VADD_VX_INT(vindex_max, offset, vl); + vbool = VMSGT_VX_INT(vindex, 0, vl); + + vb0 = VMERGE_VVM_FLOAT(va20, va10, vbool, vl); + vb1 = VMERGE_VVM_FLOAT(va21, va11, vbool, vl); + + vbx2 = VSET_VX2(vbx2, 0, vb0); + vbx2 = VSET_VX2(vbx2, 1, vb1); + VSSEG2_FLOAT(b, vbx2, vl); + + b += vl * 2; + ao1 += lda * 2; + ao2 += 2; + } + } + + return 0; +} + diff --git a/kernel/riscv64/zsymm_ucopy_rvv_v1.c b/kernel/riscv64/zsymm_ucopy_rvv_v1.c new file mode 100644 index 000000000..069551bb0 --- /dev/null +++ b/kernel/riscv64/zsymm_ucopy_rvv_v1.c @@ -0,0 +1,121 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define INT_V_T vint32m2_t +#define VID_V_INT __riscv_vid_v_i32m2 +#define VADD_VX_INT __riscv_vadd_vx_i32m2 +#define VMSGT_VX_INT __riscv_vmsgt_vx_i32m2_b16 +#define VBOOL_T vbool16_t +#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define INT_V_T vint64m2_t +#define VID_V_INT __riscv_vid_v_i64m2 +#define VADD_VX_INT __riscv_vadd_vx_i64m2 +#define VMSGT_VX_INT __riscv_vmsgt_vx_i64m2_b32 +#define VBOOL_T vbool32_t +#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f64m2 +#endif + + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b) +{ + BLASLONG i, js, offset; + + FLOAT *ao1, *ao2; + + BLASLONG stride_lda = sizeof(FLOAT)*lda * 2; + + FLOAT_V_T vb0, vb1, va10, va11, va20, va21; + FLOAT_VX2_T va1x2, va2x2, vbx2; + VBOOL_T vbool; + INT_V_T vindex_max, vindex; + + + size_t vl = VSETVL_MAX; + vindex_max = VID_V_INT(vl); + + for (js = n; js > 0; js -= vl, posX += vl) { + vl = VSETVL(js); + offset = posX - posY; + + ao1 = a + posY * 2 + (posX + 0) * lda * 2; + ao2 = a + posX * 2 + 0 + posY * lda * 2; + + for (i = m; i > 0; i--, offset--) { + va1x2 = VLSSEG2_FLOAT(ao1, stride_lda, vl); + va2x2 = VLSEG2_FLOAT(ao2, vl); + + va20 = VGET_VX2(va2x2, 0); + va21 = VGET_VX2(va2x2, 1); + va10 = VGET_VX2(va1x2, 0); + va11 = VGET_VX2(va1x2, 1); + + vindex = VADD_VX_INT(vindex_max, offset, vl); + vbool = VMSGT_VX_INT(vindex, 0, vl); + + vb0 = VMERGE_VVM_FLOAT(va20, va10, vbool, vl); + vb1 = VMERGE_VVM_FLOAT(va21, va11, vbool, vl); + + vbx2 = VSET_VX2(vbx2, 0, vb0); + vbx2 = VSET_VX2(vbx2, 1, vb1); + VSSEG2_FLOAT(b, vbx2, vl); + + b += vl * 2; + ao1 += 2; + ao2 += lda * 2; + } + } + + return 0; +} diff --git a/kernel/riscv64/zsymv_L_rvv.c b/kernel/riscv64/zsymv_L_rvv.c new file mode 100644 index 000000000..cefdea7f6 --- /dev/null +++ b/kernel/riscv64/zsymv_L_rvv.c @@ -0,0 +1,179 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m4 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 +#define VSEV_FLOAT __riscv_vse32_v_f32m4 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f32m4_tu +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#define VFNEGV_FLOAT __riscv_vfneg_v_f32mf4 +#else +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m4 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 +#define VSEV_FLOAT __riscv_vse64_v_f64m4 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f64m4_tu +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#define VFNEGV_FLOAT __riscv_vfneg_v_f64mf4 +#endif + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, + FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i, j, k; + BLASLONG ix,iy; + BLASLONG jx,jy; + FLOAT temp1[2]; + FLOAT temp2[2]; + FLOAT *a_ptr = a; + BLASLONG gvl = VSETVL_MAX; + FLOAT_V_T_M1 v_res, v_z0; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = 
VFMVVF_FLOAT_M1(0, gvl); + + FLOAT_V_T va_r, va_i, vx_r, vx_i, vy_r, vy_i, vr_r, vr_i; + BLASLONG stride_x, stride_y, inc_xv, inc_yv, len; + + stride_x = 2 * inc_x * sizeof(FLOAT); + stride_y = 2 * inc_y * sizeof(FLOAT); + jx = 0; + jy = 0; + for (j=0; j 0){ + gvl = VSETVL(len); + inc_xv = inc_x * gvl; + inc_yv = inc_y * gvl; + vr_r = VFMVVF_FLOAT(0, gvl); + vr_i = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < len / gvl; k++){ + va_r = VLSEV_FLOAT(&a_ptr[2 * i], 2 * sizeof(FLOAT), gvl); + va_i = VLSEV_FLOAT(&a_ptr[2 * i + 1], 2 * sizeof(FLOAT), gvl); + + vy_r = VLSEV_FLOAT(&y[2 * iy], stride_y, gvl); + vy_i = VLSEV_FLOAT(&y[2 * iy + 1], stride_y, gvl); + + vy_r = VFMACCVF_FLOAT(vy_r, temp1[0], va_r, gvl); + vy_r = VFNMSACVF_FLOAT(vy_r, temp1[1], va_i, gvl); + vy_i = VFMACCVF_FLOAT(vy_i, temp1[0], va_i, gvl); + vy_i = VFMACCVF_FLOAT(vy_i, temp1[1], va_r, gvl); + + VSSEV_FLOAT(&y[2 * iy], stride_y, vy_r, gvl); + VSSEV_FLOAT(&y[2 * iy + 1], stride_y, vy_i, gvl); + + vx_r = VLSEV_FLOAT(&x[2 * ix], stride_x, gvl); + vx_i = VLSEV_FLOAT(&x[2 * ix + 1], stride_x, gvl); + vr_r = VFMACCVV_FLOAT(vr_r, vx_r, va_r, gvl); + vr_r = VFNMSACVV_FLOAT(vr_r, vx_i, va_i, gvl); + vr_i = VFMACCVV_FLOAT(vr_i, vx_r, va_i, gvl); + vr_i = VFMACCVV_FLOAT(vr_i, vx_i, va_r, gvl); + + i += gvl; + ix += inc_xv; + iy += inc_yv; + } + + if(i < m){ + unsigned int gvl_rem = VSETVL(m-i); + vy_r = VLSEV_FLOAT(&y[2 * iy], stride_y, gvl_rem); + vy_i = VLSEV_FLOAT(&y[2 * iy + 1], stride_y, gvl_rem); + va_r = VLSEV_FLOAT(&a_ptr[2 * i], 2 * sizeof(FLOAT), gvl_rem); + va_i = VLSEV_FLOAT(&a_ptr[2 * i + 1], 2 * sizeof(FLOAT), gvl_rem); + + vy_r = VFMACCVF_FLOAT(vy_r, temp1[0], va_r, gvl_rem); + vy_r = VFNMSACVF_FLOAT(vy_r, temp1[1], va_i, gvl_rem); + vy_i = VFMACCVF_FLOAT(vy_i, temp1[0], va_i, gvl_rem); + vy_i = VFMACCVF_FLOAT(vy_i, temp1[1], va_r, gvl_rem); + + VSSEV_FLOAT(&y[2 * iy], stride_y, vy_r, gvl_rem); + VSSEV_FLOAT(&y[2 * iy + 1], stride_y, vy_i, gvl_rem); + + vx_r = VLSEV_FLOAT(&x[2 * ix], stride_x, gvl_rem); + vx_i = VLSEV_FLOAT(&x[2 * ix + 1], stride_x, gvl_rem); + vr_r = VFMACCVV_FLOAT_TU(vr_r, vx_r, va_r, gvl_rem); + vr_r = VFNMSACVV_FLOAT_TU(vr_r, vx_i, va_i, gvl_rem); + vr_i = VFMACCVV_FLOAT_TU(vr_i, vx_r, va_i, gvl_rem); + vr_i = VFMACCVV_FLOAT_TU(vr_i, vx_i, va_r, gvl_rem); + + } + v_res = VFREDSUM_FLOAT(vr_r, v_z0, gvl); + temp2[0] = VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUM_FLOAT(vr_i, v_z0, gvl); + temp2[1] = VFMVFS_FLOAT_M1(v_res); + } + y[2 * jy] += alpha_r * temp2[0] - alpha_i * temp2[1]; + y[2 * jy + 1] += alpha_r * temp2[1] + alpha_i * temp2[0]; + + jx += inc_x; + jy += inc_y; + a_ptr += 2 * lda; + } + + return(0); +} + diff --git a/kernel/riscv64/zsymv_U_rvv.c b/kernel/riscv64/zsymv_U_rvv.c new file mode 100644 index 000000000..67b5a649c --- /dev/null +++ b/kernel/riscv64/zsymv_U_rvv.c @@ -0,0 +1,177 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m4 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 +#define VSEV_FLOAT __riscv_vse32_v_f32m4 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f32m4_tu +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m4 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 +#define VSEV_FLOAT __riscv_vse64_v_f64m4 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f64m4_tu +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#endif + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, + FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i, j, k; + BLASLONG ix,iy; + BLASLONG jx,jy; + FLOAT temp1[2]; + FLOAT temp2[2]; + FLOAT *a_ptr = a; + BLASLONG gvl = VSETVL_MAX; + FLOAT_V_T_M1 v_res, v_z0; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); + + + FLOAT_V_T va_r, va_i, vx_r, vx_i, vy_r, vy_i, vr_r, vr_i; + BLASLONG stride_x, stride_y, inc_xv, inc_yv; + + BLASLONG m1 = m - offset; + jx = m1 * inc_x; + jy = m1 * inc_y; + a_ptr += m1 * lda; + stride_x = 2 * inc_x * sizeof(FLOAT); + stride_y = 2 * inc_y * sizeof(FLOAT); + for 
(j=m1; j 0){ + ix = 0; + iy = 0; + i = 0; + gvl = VSETVL(j); + inc_xv = inc_x * gvl; + inc_yv = inc_y * gvl; + vr_r = VFMVVF_FLOAT(0, gvl); + vr_i = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < j / gvl; k++){ + va_r = VLSEV_FLOAT(&a_ptr[2 * i], 2 * sizeof(FLOAT), gvl); + va_i = VLSEV_FLOAT(&a_ptr[2 * i + 1], 2 * sizeof(FLOAT), gvl); + + vy_r = VLSEV_FLOAT(&y[2 * iy], stride_y, gvl); + vy_i = VLSEV_FLOAT(&y[2 * iy + 1], stride_y, gvl); + + vy_r = VFMACCVF_FLOAT(vy_r, temp1[0], va_r, gvl); + vy_r = VFNMSACVF_FLOAT(vy_r, temp1[1], va_i, gvl); + vy_i = VFMACCVF_FLOAT(vy_i, temp1[0], va_i, gvl); + vy_i = VFMACCVF_FLOAT(vy_i, temp1[1], va_r, gvl); + + VSSEV_FLOAT(&y[2 * iy], stride_y, vy_r, gvl); + VSSEV_FLOAT(&y[2 * iy + 1], stride_y, vy_i, gvl); + + vx_r = VLSEV_FLOAT(&x[2 * ix], stride_x, gvl); + vx_i = VLSEV_FLOAT(&x[2 * ix + 1], stride_x, gvl); + vr_r = VFMACCVV_FLOAT(vr_r, vx_r, va_r, gvl); + vr_r = VFNMSACVV_FLOAT(vr_r, vx_i, va_i, gvl); + vr_i = VFMACCVV_FLOAT(vr_i, vx_r, va_i, gvl); + vr_i = VFMACCVV_FLOAT(vr_i, vx_i, va_r, gvl); + + i += gvl; + ix += inc_xv; + iy += inc_yv; + } + + if(i < j){ + unsigned int gvl_rem = VSETVL(j-i); + vy_r = VLSEV_FLOAT(&y[2 * iy], stride_y, gvl_rem); + vy_i = VLSEV_FLOAT(&y[2 * iy + 1], stride_y, gvl_rem); + + va_r = VLSEV_FLOAT(&a_ptr[2 * i], 2 * sizeof(FLOAT), gvl_rem); + va_i = VLSEV_FLOAT(&a_ptr[2 * i + 1], 2 * sizeof(FLOAT), gvl_rem); + + vy_r = VFMACCVF_FLOAT(vy_r, temp1[0], va_r, gvl_rem); + vy_r = VFNMSACVF_FLOAT(vy_r, temp1[1], va_i, gvl_rem); + vy_i = VFMACCVF_FLOAT(vy_i, temp1[0], va_i, gvl_rem); + vy_i = VFMACCVF_FLOAT(vy_i, temp1[1], va_r, gvl_rem); + + VSSEV_FLOAT(&y[2 * iy], stride_y, vy_r, gvl_rem); + VSSEV_FLOAT(&y[2 * iy + 1], stride_y, vy_i, gvl_rem); + + vx_r = VLSEV_FLOAT(&x[2 * ix], stride_x, gvl_rem); + vx_i = VLSEV_FLOAT(&x[2 * ix + 1], stride_x, gvl_rem); + vr_r = VFMACCVV_FLOAT_TU(vr_r, vx_r, va_r, gvl_rem); + vr_r = VFNMSACVV_FLOAT_TU(vr_r, vx_i, va_i, gvl_rem); + vr_i = VFMACCVV_FLOAT_TU(vr_i, vx_r, va_i, gvl_rem); + vr_i = VFMACCVV_FLOAT_TU(vr_i, vx_i, va_r, gvl_rem); + + } + v_res = VFREDSUM_FLOAT(vr_r, v_z0, gvl); + temp2[0] = VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUM_FLOAT(vr_i, v_z0, gvl); + temp2[1] = VFMVFS_FLOAT_M1(v_res); + } + + y[2 * jy] += temp1[0] * a_ptr[j * 2] - temp1[1] * a_ptr[j * 2 + 1] + alpha_r * temp2[0] - alpha_i * temp2[1]; + y[2 * jy + 1] += temp1[1] * a_ptr[j * 2] + temp1[0] * a_ptr[j * 2 + 1] + alpha_r * temp2[1] + alpha_i * temp2[0]; + + a_ptr += 2 * lda; + jx += inc_x; + jy += inc_y; + } + + return(0); +} + diff --git a/kernel/riscv64/ztrmm_kernel_4x4_zvl128b.c b/kernel/riscv64/ztrmm_kernel_4x4_zvl128b.c new file mode 100644 index 000000000..d7d5e5fea --- /dev/null +++ b/kernel/riscv64/ztrmm_kernel_4x4_zvl128b.c @@ -0,0 +1,805 @@ +/* + +AUTOGENERATED KERNEL +Script: ./kernel/riscv64/generate_kernel.py +Settings: + LMUL=2 + M=4 + M_tail_scalar_from=2 + N=4 + __riscv_='__riscv_' + complex=True + conjugate=False + cpu='zvl128b' + force_acc_double=False + index_type='BLASLONG' + op='trmm' + param_precision='double' + reg_width_bits=128 + tail_policy='' + trace=False + +Derived: + ELEN_ACC=64 + ELEN_PARAM=64 + LMUL_ACC=2 + VFMACC='__riscv_vfmacc_vf_f64m2' + VFMUL='__riscv_vfmul_vf_f64m2' + VLEV='__riscv_vle64_v_f64m2' + VLSEV='__riscv_vlse64_v_f64m2' + VMACC_TO_ACC='__riscv_vfmacc_vf_f64m2' + VMUL_TO_ACC='__riscv_vfmul_vf_f64m2' + VSETVL='__riscv_vsetvl_e64m2' + VSEV='__riscv_vse64_v_f64m2' + VSSEV='__riscv_vsse64_v_f64m2' + acc_vector_t='vfloat64m2_t' + output='ztrmm_kernel_4x4_zvl128b.c' + 
param_scalar_t='double' + param_vector_t='vfloat64m2_t' + +*/ + +#include "common.h" + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define S0 1 +#define S1 -1 +#define S2 1 +#define S3 1 +#define VFMACC_RR __riscv_vfmsac +#define VFMACC_RI __riscv_vfmacc +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define S0 1 +#define S1 1 +#define S2 1 +#define S3 -1 +#define VFMACC_RR __riscv_vfmacc +#define VFMACC_RI __riscv_vfmsac +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define S0 1 +#define S1 1 +#define S2 -1 +#define S3 1 +#define VFMACC_RR __riscv_vfmacc +#define VFMACC_RI __riscv_vfnmsac +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define S0 1 +#define S1 -1 +#define S2 -1 +#define S3 -1 +#define VFMACC_RR __riscv_vfmsac +#define VFMACC_RI __riscv_vfnmacc +#endif + +#if defined(LEFT) != defined(TRANSA) +#define BACKWARDS +#endif + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc, BLASLONG offset) + +{ + BLASLONG gvl = 0; + BLASLONG m_top = 0; + BLASLONG n_top = 0; + + // -- MAIN PASS + + for (BLASLONG j = 0; j < N / 4; j += 1) { + m_top = 0; + BLASLONG gvl = __riscv_vsetvl_e64m2(4); + + for (BLASLONG i = 0; i < M / 4; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4 * 2; + bi += off * 4 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 4; +#endif +#endif + double B0r = B[bi + 0 * 2 + 0]; + double B0i = B[bi + 0 * 2 + 1]; + double B1r = B[bi + 1 * 2 + 0]; + double B1i = B[bi + 1 * 2 + 1]; + double B2r = B[bi + 2 * 2 + 0]; + double B2i = B[bi + 2 * 2 + 1]; + double B3r = B[bi + 3 * 2 + 0]; + double B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + vfloat64m2_t A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k + // leaving 6 vector registers for temporaries + // performing 2 operations between reuses of temporaries + vfloat64m2_t tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + vfloat64m2_t tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + vfloat64m2_t tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); + vfloat64m2_t tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat64m2_t ACC0r = tmp0r; + vfloat64m2_t ACC0i = tmp0i; + vfloat64m2_t ACC1r = tmp1r; + vfloat64m2_t ACC1i = tmp1i; + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f64m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f64m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + vfloat64m2_t ACC2r = tmp0r; + vfloat64m2_t ACC2i = tmp0i; + vfloat64m2_t ACC3r = tmp1r; + vfloat64m2_t ACC3i = tmp1i; + + for (BLASLONG k = 1; k < pass_K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + B2r = B[bi + 2 * 2 + 0]; + B2i = 
B[bi + 2 * 2 + 1]; + B3r = B[bi + 3 * 2 + 0]; + B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f64m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f64m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl); + ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl); + ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl); + ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl); + vfloat64m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl); + vfloat64m2_t C1r = __riscv_vfmul(ACC1r, alphar, gvl); + vfloat64m2_t C1i = __riscv_vfmul(ACC1i, alphar, gvl); + vfloat64m2_t C2r = __riscv_vfmul(ACC2r, alphar, gvl); + vfloat64m2_t C2i = __riscv_vfmul(ACC2i, alphar, gvl); + vfloat64m2_t C3r = __riscv_vfmul(ACC3r, alphar, gvl); + vfloat64m2_t C3i = __riscv_vfmul(ACC3i, alphar, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl); + C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl); + C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl); + C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl); + + m_top += 4; + } + + // -- tails for main pass + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + double result4 = 0; + double result5 = 0; + double result6 = 0; + double result7 = 0; + double result8 = 0; + double result9 = 0; + double result10 = 0; + double result11 = 0; + double result12 = 0; + double result13 = 0; + double result14 = 0; + double result15 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2 * 2; + bi += off * 4 * 2; + pass_K 
-= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 4; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; + result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; + result8 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; + result9 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; + result10 += S0 * A[ai + 2 + 0] * B[bi + 4 + 0] + S1 * A[ai + 2 + 1] * B[bi + 4 + 1]; + result11 += S2 * A[ai + 2 + 1] * B[bi + 4 + 0] + S3 * A[ai + 2 + 0] * B[bi + 4 + 1]; + result12 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; + result13 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; + result14 += S0 * A[ai + 2 + 0] * B[bi + 6 + 0] + S1 * A[ai + 2 + 1] * B[bi + 6 + 1]; + result15 += S2 * A[ai + 2 + 1] * B[bi + 6 + 0] + S3 * A[ai + 2 + 0] * B[bi + 6 + 1]; + ai += 2 * 2; + bi += 4 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = result2 * alphar; + Ci = result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + Cr = result4 * alphar; + Ci = result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = result6 * alphar; + Ci = result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; + Cr = result8 * alphar; + Ci = result9 * alphar; + Cr -= result9 * alphai; + Ci += result8 * alphai; + C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; + Cr = result10 * alphar; + Ci = result11 * alphar; + Cr -= result11 * alphai; + Ci += result10 * alphai; + C[(ci + 2 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 1) * 2 + 1] = Ci; + Cr = result12 * alphar; + Ci = result13 * alphar; + Cr -= result13 * alphai; + Ci += result12 * alphai; + C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; + Cr = result14 * alphar; + Ci = result15 * alphar; + Cr -= result15 * alphai; + Ci += result14 * alphai; + C[(ci + 3 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + double result4 = 0; + double result5 = 0; + double result6 = 0; + double result7 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1 * 2; + bi += off * 4 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; 
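+			// forward (non-BACKWARDS) case: the triangular factor bounds the inner
+			// product, so the k loop is clipped to the diagonal offset plus this
+			// tile's extent: off + 1 for the single row of A handled here, and
+			// off + 4 for the 4-column B tile in the branch below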
+#else + pass_K = off + 4; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; + result6 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; + result7 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; + ai += 1 * 2; + bi += 4 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = result2 * alphar; + Ci = result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = result4 * alphar; + Ci = result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; + Cr = result6 * alphar; + Ci = result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 4; + } + + // -- tails for N=2 + + if (N & 2) { + gvl = __riscv_vsetvl_e64m2(4); + m_top = 0; + + for (BLASLONG i = 0; i < M / 4; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4 * 2; + bi += off * 2 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 2; +#endif +#endif + double B0r = B[bi + 0 * 2 + 0]; + double B0i = B[bi + 0 * 2 + 1]; + double B1r = B[bi + 1 * 2 + 0]; + double B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + vfloat64m2_t A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k + // leaving 10 vector registers for temporaries + vfloat64m2_t tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + vfloat64m2_t tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + vfloat64m2_t tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); + vfloat64m2_t tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat64m2_t ACC0r = tmp0r; + vfloat64m2_t ACC0i = tmp0i; + vfloat64m2_t ACC1r = tmp1r; + vfloat64m2_t ACC1i = tmp1i; + + for (BLASLONG k = 1; k < pass_K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, 
B0i, gvl); + tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl); + vfloat64m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl); + vfloat64m2_t C1r = __riscv_vfmul(ACC1r, alphar, gvl); + vfloat64m2_t C1i = __riscv_vfmul(ACC1i, alphar, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + + m_top += 4; + } + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + double result4 = 0; + double result5 = 0; + double result6 = 0; + double result7 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2 * 2; + bi += off * 2 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 2; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; + result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; + ai += 2 * 2; + bi += 2 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = result2 * alphar; + Ci = result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + Cr = result4 * alphar; + Ci = result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = result6 * alphar; + Ci = result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off 
= offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1 * 2; + bi += off * 2 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 2; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + ai += 1 * 2; + bi += 2 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = result2 * alphar; + Ci = result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 2; + } + + // -- tails for N=1 + + if (N & 1) { + gvl = __riscv_vsetvl_e64m2(4); + m_top = 0; + + for (BLASLONG i = 0; i < M / 4; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4 * 2; + bi += off * 1 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 1; +#endif +#endif + double B0r = B[bi + 0 * 2 + 0]; + double B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + vfloat64m2_t A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k + // leaving 12 vector registers for temporaries + vfloat64m2_t tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + vfloat64m2_t tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + vfloat64m2_t ACC0r = tmp0r; + vfloat64m2_t ACC0i = tmp0i; + + for (BLASLONG k = 1; k < pass_K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl); + vfloat64m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + + m_top += 4; + } + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai 
+= off * 2 * 2; + bi += off * 1 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 1; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + ai += 2 * 2; + bi += 1 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = result2 * alphar; + Ci = result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1 * 2; + bi += off * 1 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 1; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + ai += 1 * 2; + bi += 1 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 1; + } + + return 0; +} diff --git a/kernel/riscv64/ztrmm_kernel_8x4_zvl256b.c b/kernel/riscv64/ztrmm_kernel_8x4_zvl256b.c new file mode 100644 index 000000000..de7622b89 --- /dev/null +++ b/kernel/riscv64/ztrmm_kernel_8x4_zvl256b.c @@ -0,0 +1,1337 @@ +/* + +AUTOGENERATED KERNEL +Settings: + LMUL=1 + M=8 + M_tail_scalar_from=1 + N=4 + __riscv_='__riscv_' + complex=True + conjugate=False + cpu='zvl256b' + force_acc_double=False + index_type='BLASLONG' + op='trmm' + param_precision='double' + reg_width_bits=256 + tail_policy='' + trace=False + +Derived: + ELEN_ACC=64 + ELEN_PARAM=64 + LMUL_ACC=1 + VFMACC='__riscv_vfmacc_vf_f64m1' + VFMUL='__riscv_vfmul_vf_f64m1' + VLEV='__riscv_vle64_v_f64m1' + VLSEV='__riscv_vlse64_v_f64m1' + VMACC_TO_ACC='__riscv_vfmacc_vf_f64m1' + VMUL_TO_ACC='__riscv_vfmul_vf_f64m1' + VSETVL='__riscv_vsetvl_e64m1' + VSEV='__riscv_vse64_v_f64m1' + VSSEV='__riscv_vsse64_v_f64m1' + acc_vector_t='vfloat64m1_t' + output='ztrmm_kernel_8x4_zvl256b.c' + param_scalar_t='double' + param_vector_t='vfloat64m1_t' + +*/ + +#include "common.h" + + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + #define S0 1 + #define S1 -1 + #define S2 1 + #define S3 1 + #define VFMACC_RR __riscv_vfmsac + #define VFMACC_RI __riscv_vfmacc +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + #define S0 1 + #define S1 1 + #define S2 1 + #define S3 -1 + #define VFMACC_RR __riscv_vfmacc + #define VFMACC_RI __riscv_vfmsac +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + #define S0 1 + #define S1 1 + #define S2 -1 + #define S3 1 + #define VFMACC_RR __riscv_vfmacc + #define VFMACC_RI 
__riscv_vfnmsac +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + #define S0 1 + #define S1 -1 + #define S2 -1 + #define S3 -1 + #define VFMACC_RR __riscv_vfmsac + #define VFMACC_RI __riscv_vfnmacc +#endif + + +#if defined(LEFT) != defined(TRANSA) + #define BACKWARDS +#endif + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc, BLASLONG offset) + +{ + BLASLONG gvl = 0; + BLASLONG m_top = 0; + BLASLONG n_top = 0; + + + // -- MAIN PASS + + for (BLASLONG j=0; j +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VBOOL_T vbool16_t +#define UINT_V_T vint32m2_t +#define VID_V_UINT __riscv_vid_v_i32m2 +#define VMSGTU_VX_UINT __riscv_vmsgt_vx_i32m2_b16 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_i32m2_b16 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u64m2_b32 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f64m2 +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, X; + + FLOAT *ao; + + BLASLONG stride_lda = sizeof(FLOAT)*lda*2; + + FLOAT_VX2_T vax2; + FLOAT_V_T va0, va1; + + size_t vl; +#ifdef UNIT + VBOOL_T vbool_eq; +#endif + + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + X = posX; + + if (posX <= posY) + { + ao = a + posY * 2 + posX * lda * 2; + } + else + { + ao = a + posX * 2 + posY * lda * 2; + } + + i = 0; + do + { + if (X > posY) + { + vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl); + VSSEG2_FLOAT(b, vax2, vl); + + ao += 2; + b += vl * 2; + + X ++; + i ++; + } + else if (X < posY) + { + ao += lda * 2; + b += vl * 2; + X ++; + i ++; + } + else + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + + vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); + va0 = VFMERGE_VFM_FLOAT(va0, ZERO, vbool_cmp, vl); + va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); +#ifdef UNIT + vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); + va0 = VFMERGE_VFM_FLOAT(va0, ONE, vbool_eq, vl); + va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_eq, vl); +#endif + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(b, vax2, vl); + ao += 2; + b += vl * 2; + } + + X += vl; + i += vl; + } + } while (i < m); + + posY += vl; + } + + return 0; +} diff --git a/kernel/riscv64/ztrmm_ltcopy_rvv_v1.c b/kernel/riscv64/ztrmm_ltcopy_rvv_v1.c new file 
mode 100644 index 000000000..ab8d34337 --- /dev/null +++ b/kernel/riscv64/ztrmm_ltcopy_rvv_v1.c @@ -0,0 +1,152 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u32m2_b16 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u32m2_b16 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u64m2_b32 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f64m2 +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, X; + + FLOAT *ao; + + FLOAT_VX2_T vax2; + FLOAT_V_T va0, va1; + size_t vl; +#ifdef UNIT + VBOOL_T vbool_eq; +#endif + + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + X = posX; + + if (posX <= posY) + { + ao = a + posY * 2 + posX * lda * 2; + 
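            /* Editorial note (not part of the patch): A is stored column-major with a leading
               dimension of lda complex elements, and each complex element occupies two FLOATs,
               which is why every index in this routine is scaled by 2. The assignment above
               points ao at element (row = posY, column = posX) of A, i.e. the corner of the
               panel that this js-block will pack into b. */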
} + else + { + ao = a + posX * 2 + posY * lda * 2; + } + + i = 0; + do + { + if (X > posY) + { + ao += 2; + b += vl * 2; + X++; + i++; + } + else if (X < posY) + { + //va1 = VLEV_FLOAT(ao, vl); + vax2 = VLSEG2_FLOAT(ao, vl); + VSSEG2_FLOAT(b, vax2, vl); + + ao += lda * 2; + b += vl * 2; + X ++; + i ++; + } + else + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + //va1 = VLEV_FLOAT(ao, vl); + vax2 = VLSEG2_FLOAT(ao, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + + vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); + va0 = VFMERGE_VFM_FLOAT(va0, ZERO, vbool_cmp, vl); + va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); +#ifdef UNIT + vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); + va0 = VFMERGE_VFM_FLOAT(va0, ONE, vbool_eq, vl); + va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_eq, vl); +#endif + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(b, vax2, vl); + ao += lda * 2; + b += vl * 2; + } + X += vl; + i += vl; + + } + } while (i < m); + + posY += vl; + } + + return 0; +} + diff --git a/kernel/riscv64/ztrmm_uncopy_rvv_v1.c b/kernel/riscv64/ztrmm_uncopy_rvv_v1.c new file mode 100644 index 000000000..ba6e63b96 --- /dev/null +++ b/kernel/riscv64/ztrmm_uncopy_rvv_v1.c @@ -0,0 +1,154 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u32m2_b16 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u32m2_b16 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u64m2_b32 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f64m2 +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, X; + BLASLONG stride_lda = sizeof(FLOAT) * lda * 2; + FLOAT *ao; + + FLOAT_VX2_T vax2; + FLOAT_V_T va0, va1; + size_t vl; + +#ifdef UNIT + VBOOL_T vbool_eq; +#endif + + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + X = posX; + + if (posX <= posY) + { + ao = a + posX * 2 + posY * lda * 2; + } + else + { + ao = a + posY * 2 + posX * lda * 2; + } + + i = 0; + do + { + if (X < posY) + { + vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl); + VSSEG2_FLOAT(b, vax2, vl); + + ao += 2; + b += vl * 2; + + X++; + i++; + } + else if (X > posY) + { + ao += lda * 2; + b += vl * 2; + + X++; + i++; + } + else + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + + vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); + va0 = VFMERGE_VFM_FLOAT(va0, ZERO, vbool_cmp, vl); + va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); +#ifdef UNIT + vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); + va0 = VFMERGE_VFM_FLOAT(va0, ONE, vbool_eq, vl); + va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_eq, vl); +#endif + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(b, vax2, vl); + ao += 2; + b += vl * 2; + } + + X += vl; + i += vl; + } + }while (i < m); + + posY += vl; + } + + return 0; +} diff --git a/kernel/riscv64/ztrmm_utcopy_rvv_v1.c b/kernel/riscv64/ztrmm_utcopy_rvv_v1.c new file mode 100644 index 000000000..a624fff54 --- /dev/null +++ b/kernel/riscv64/ztrmm_utcopy_rvv_v1.c @@ -0,0 +1,151 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u32m2_b16 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u32m2_b16 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u64m2_b32 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f64m2 +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, j, js, X; + + FLOAT *ao; + + FLOAT_VX2_T vax2; + FLOAT_V_T va0, va1; +#ifdef UNIT + VBOOL_T vbool_eq; +#endif + + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + size_t vl; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + + X = posX; + + if (posX <= posY) + { + ao = a + posX * 2 + posY * lda * 2; + } + else + { + ao = a + posY * 2 + posX * lda * 2; + } + + i = 0; + do + { + if (X < posY) + { + ao += 2; + b += vl * 2; + X++; + i++; + } + else if (X > posY) + { + vax2 = VLSEG2_FLOAT(ao, vl); + VSSEG2_FLOAT(b, vax2, vl); + ao += lda * 2; + b += 
vl * 2; + X++; + i++; + } + else + { + vindex = VID_V_UINT(vl); + for (j = 0; j < vl; j++) + { + vax2 = VLSEG2_FLOAT(ao, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + + vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); + va0 = VFMERGE_VFM_FLOAT(va0, ZERO, vbool_cmp, vl); + va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); +#ifdef UNIT + vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); + va0 = VFMERGE_VFM_FLOAT(va0, ONE, vbool_eq, vl); + va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_eq, vl); +#endif + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(b, vax2, vl); + ao += lda * 2; + b += vl * 2; + } + X += vl; + i += vl; + } + }while (i < m); + posY += vl; + } + + return 0; +} + diff --git a/kernel/riscv64/ztrmmkernel_2x2_rvv.c b/kernel/riscv64/ztrmmkernel_2x2_rvv.c new file mode 100644 index 000000000..399124d2e --- /dev/null +++ b/kernel/riscv64/ztrmmkernel_2x2_rvv.c @@ -0,0 +1,596 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m2() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m2_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VLSEG4_FLOAT __riscv_vlseg4e32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m2 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m2 +#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f32m2_f32m1 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m2() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m2_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VLSEG4_FLOAT __riscv_vlseg4e64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m2 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m2 +#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f64m2_f64m1 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#endif + +// Optimizes the implementation in ../generic/ztrmmkernel_2x2.c + + +/******************************** + ADD1 a*c + ADD2 b*c + ADD3 a*d + ADD4 b*d + *********************************/ +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* ba,FLOAT* bb, + FLOAT* C,BLASLONG ldc, BLASLONG offset) +{ + BLASLONG i,j,k; + FLOAT *C0,*C1,*ptrba,*ptrbb; + FLOAT res0,res1; + BLASLONG off, temp; + + FLOAT_V_T va0, va1, va2, va3, vb0, vb1, vb2, vb3; + FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7; + FLOAT_V_T_M1 v_m1_res0, v_m1_res1; + FLOAT_V_T_M1 v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); + + size_t vl; + size_t vlmax = VSETVL_MAX; + +#if defined(TRMMKERNEL) && !defined(LEFT) + off = -offset; +#else + off = 0; +#endif + + for (j = bn/2; j > 0; j--) + { +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + C0 = C; + C1 = C0+2*ldc; + ptrba = ba; + + for (i = bm/2; i > 0; i--) + { +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*2*2; + ptrbb = bb+off*2*2; +#endif + + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + vres2 = VFMVVF_FLOAT(0.0, vlmax); + vres3 = VFMVVF_FLOAT(0.0, vlmax); + vres4 = VFMVVF_FLOAT(0.0, vlmax); + vres5 = VFMVVF_FLOAT(0.0, vlmax); + vres6 = VFMVVF_FLOAT(0.0, vlmax); + vres7 = VFMVVF_FLOAT(0.0, vlmax); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk - off; +#elif defined(LEFT) + temp = off + 2; +#else + temp = off + 2; +#endif + + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); + VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = 
VFMACCVV_FLOAT(vres3, va3, vb0, vl); + vres2 = VFNMSACVV_FLOAT(vres2, va3, vb1, vl); + vres3 = VFMACCVV_FLOAT(vres3, va2, vb1, vl); + + vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl); + vres5 = VFMACCVV_FLOAT(vres5, va1, vb2, vl); + vres4 = VFNMSACVV_FLOAT(vres4, va1, vb3, vl); + vres5 = VFMACCVV_FLOAT(vres5, va0, vb3, vl); + + vres6 = VFMACCVV_FLOAT(vres6, va2, vb2, vl); + vres7 = VFMACCVV_FLOAT(vres7, va3, vb2, vl); + vres6 = VFNMSACVV_FLOAT(vres6, va3, vb3, vl); + vres7 = VFMACCVV_FLOAT(vres7, va2, vb3, vl); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va2, vb1, vl); + + vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl); + vres5 = VFMACCVV_FLOAT(vres5, va1, vb2, vl); + vres4 = VFMACCVV_FLOAT(vres4, va1, vb3, vl); + vres5 = VFNMSACVV_FLOAT(vres5, va0, vb3, vl); + + vres6 = VFMACCVV_FLOAT(vres6, va2, vb2, vl); + vres7 = VFMACCVV_FLOAT(vres7, va3, vb2, vl); + vres6 = VFMACCVV_FLOAT(vres6, va3, vb3, vl); + vres7 = VFNMSACVV_FLOAT(vres7, va2, vb3, vl); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va3, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl); + vres3 = VFMACCVV_FLOAT(vres3, va2, vb1, vl); + + vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl); + vres5 = VFNMSACVV_FLOAT(vres5, va1, vb2, vl); + vres4 = VFMACCVV_FLOAT(vres4, va1, vb3, vl); + vres5 = VFMACCVV_FLOAT(vres5, va0, vb3, vl); + + vres6 = VFMACCVV_FLOAT(vres6, va2, vb2, vl); + vres7 = VFNMSACVV_FLOAT(vres7, va3, vb2, vl); + vres6 = VFMACCVV_FLOAT(vres6, va3, vb3, vl); + vres7 = VFMACCVV_FLOAT(vres7, va2, vb3, vl); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va3, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va2, vb1, vl); + + vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl); + vres5 = VFNMSACVV_FLOAT(vres5, va1, vb2, vl); + vres4 = VFMACCVV_FLOAT(vres4, va1, vb3, vl); + vres5 = VFNMSACVV_FLOAT(vres5, va0, vb3, vl); + + vres6 = VFMACCVV_FLOAT(vres6, va2, vb2, vl); + vres7 = VFNMSACVV_FLOAT(vres7, va3, vb2, vl); + vres6 = VFMACCVV_FLOAT(vres6, va3, vb3, vl); + vres7 = VFNMSACVV_FLOAT(vres7, va2, vb3, vl); + +#endif + ptrba += vl * 4; + ptrbb += vl * 4; + } + + v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres0, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres1, v_z0, vlmax); + res0 = VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + C0[0] = res0 * alphar - res1 * alphai; + C0[1] = res1 * alphar + res0 * alphai; + + v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres2, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres3, v_z0, vlmax); + res0 = VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + 
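            /* Editorial note (not part of the patch): res0/res1 now hold the real and imaginary
               parts of the dot product for this output element, obtained by reducing the vector
               accumulators with vfredusum. The two stores below apply the complex scaling by
               alpha = alphar + i*alphai:
                   C_re = res0*alphar - res1*alphai
                   C_im = res1*alphar + res0*alphai
               (the TRMM kernel writes C directly rather than accumulating into it). */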
C0[2] = res0 * alphar - res1 * alphai; + C0[3] = res1 * alphar + res0 * alphai; + + v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres4, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres5, v_z0, vlmax); + res0 = VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + C1[0] = res0 * alphar - res1 * alphai; + C1[1] = res1 * alphar + res0 * alphai; + + v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres6, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres7, v_z0, vlmax); + res0 = VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + C1[2] = res0 * alphar - res1 * alphai; + C1[3] = res1 * alphar + res0 * alphai; +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= 2; +#else + temp -= 2; +#endif + + ptrba += temp*2*2; + ptrbb += temp*2*2; + +#endif + +#ifdef LEFT + off += 2; +#endif + + C0 = C0+4; + C1 = C1+4; + } + + if (bm & 1) + { +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*2; + ptrbb = bb + off*2*2; +#endif + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + vres2 = VFMVVF_FLOAT(0.0, vlmax); + vres3 = VFMVVF_FLOAT(0.0, vlmax); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk - off; +#elif defined(LEFT) + temp = off+1; +#else + temp = off+2; +#endif + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl); + vres3 = VFMACCVV_FLOAT(vres3, va1, vb2, vl); + vres2 = VFNMSACVV_FLOAT(vres2, va1, vb3, vl); + vres3 = VFMACCVV_FLOAT(vres3, va0, vb3, vl); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl); + vres3 = VFMACCVV_FLOAT(vres3, va1, vb2, vl); + vres2 = VFMACCVV_FLOAT(vres2, va1, vb3, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va0, vb3, vl); + +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va1, vb2, vl); + vres2 = VFMACCVV_FLOAT(vres2, va1, vb3, vl); + vres3 = VFMACCVV_FLOAT(vres3, va0, vb3, vl); + +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va1, vb2, vl); + vres2 = VFNMSACVV_FLOAT(vres2, va1, vb3, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va0, vb3, vl); + +#endif + ptrba += vl * 2; + ptrbb += vl * 4; + } + v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres0, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres1, v_z0, vlmax); + res0 = 
VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + C0[0] = res0 * alphar - res1 * alphai; + C0[1] = res1 * alphar + res0 * alphai; + + v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres2, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres3, v_z0, vlmax); + res0 = VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + C1[0] = res0 * alphar - res1 * alphai; + C1[1] = res1 * alphar + res0 * alphai; + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= 1; +#else + temp -= 2; +#endif + ptrba += temp*2; + ptrbb += temp*2*2; +#endif +#ifdef LEFT + off += 1; +#endif + C0 = C0+2; + C1 = C1+2; + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; +#endif + k = (bk<<2); + bb = bb+k; + i = (ldc<<2); + C = C+i; + } + + if (bn & 1) + { + C0 = C; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + ptrba = ba; + + for (i = bm/2; i > 0; i--) + { +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*2*2; + ptrbb = bb+off*2; +#endif + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + vres2 = VFMVVF_FLOAT(0.0, vlmax); + vres3 = VFMVVF_FLOAT(0.0, vlmax); +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk - off; +#elif defined(LEFT) + temp = off + 2; +#else + temp = off + 1; +#endif + + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); + VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl); + vres2 = VFNMSACVV_FLOAT(vres2, va3, vb1, vl); + vres3 = VFMACCVV_FLOAT(vres3, va2, vb1, vl); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va2, vb1, vl); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va3, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl); + vres3 = VFMACCVV_FLOAT(vres3, va2, vb1, vl); + +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va3, vb0, vl); + vres2 = VFNMSACVV_FLOAT(vres2, va3, vb1, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va2, vb1, vl); + +#endif + ptrba += vl * 4; + ptrbb += vl * 2; + } + v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres0, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres1, v_z0, 
vlmax); + res0 = VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + C0[0] = res0 * alphar - res1 * alphai; + C0[1] = res1 * alphar + res0 * alphai; + + v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres2, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres3, v_z0, vlmax); + res0 = VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + C0[2] = res0 * alphar - res1 * alphai; + C0[3] = res1 * alphar + res0 * alphai; + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + temp = bk-off; +#ifdef LEFT + temp -= 2; +#else + temp -= 1; +#endif + ptrba += temp*2*2; + ptrbb += temp*2; +#endif +#ifdef LEFT + off += 2; +#endif + C0 = C0+4; + } + + if (bm & 1) + { +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*2; + ptrbb = bb + off*2; +#endif + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off + 1; +#else + temp = off + 1; +#endif + + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); + +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); + +#endif + ptrba += vl * 2; + ptrbb += vl * 2; + + } + + v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres0, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres1, v_z0, vlmax); + res0 = VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + + C0[0] = res0 * alphar - res1 * alphai; + C0[1] = res1 * alphar + res0 * alphai; + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= 1; +#else + temp -= 1; +#endif + ptrba += temp*2; + ptrbb += temp*2; + +#endif +#ifdef LEFT + off += 1; +#endif + C0 = C0+2; + } + k = (bk<<1); + bb = bb+k; + i = (ldc<<1); + C = C+i; + } + return 0; +} diff --git a/kernel/riscv64/ztrmmkernel_rvv_v1x4.c b/kernel/riscv64/ztrmmkernel_rvv_v1x4.c new file mode 100644 index 000000000..db5f06af8 --- /dev/null +++ b/kernel/riscv64/ztrmmkernel_rvv_v1x4.c @@ -0,0 +1,632 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m2 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define OP_rr VFMACCVF_FLOAT +#define OP_ir VFMACCVF_FLOAT +#define OP_ii VFNMSACVF_FLOAT +#define OP_ri VFMACCVF_FLOAT +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define OP_rr VFMACCVF_FLOAT +#define OP_ir VFMACCVF_FLOAT +#define OP_ii VFMACCVF_FLOAT +#define OP_ri VFNMSACVF_FLOAT +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define OP_rr VFMACCVF_FLOAT +#define OP_ir VFNMSACVF_FLOAT +#define OP_ii VFMACCVF_FLOAT +#define OP_ri VFMACCVF_FLOAT +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define OP_rr VFMACCVF_FLOAT +#define OP_ir VFNMSACVF_FLOAT +#define OP_ii VFNMSACVF_FLOAT +#define OP_ri VFNMSACVF_FLOAT +#endif + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* ba,FLOAT* bb,FLOAT* C, BLASLONG ldc, BLASLONG offset) +{ + BLASLONG i,j,k; + FLOAT *C0, *C1, *C2, *C3, *ptrba,*ptrbb; + BLASLONG 
off, temp; + +#if defined(TRMMKERNEL) && !defined(LEFT) + off = -offset; +#else + off = 0; +#endif + + FLOAT_VX2_T vax2; + FLOAT_V_T va0, va1, va2, va3, va4, va5, va6, va7; + FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7; + + //fprintf(stderr, "%s, bn=%ld bm=%ld bk=%ld alphar=%f alphai=%f ldc=%ld, offset=%ld\n", __FUNCTION__, bn, bm, bk, alphar, alphai, ldc, offset); // Debug + + size_t vl; + for (j = bn/4; j > 0; j--) + { + C0 = C; + C1 = C0 + 2 * ldc; + C2 = C1 + 2 * ldc; + C3 = C2 + 2 * ldc; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + ptrba = ba; + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*vl*2; + ptrbb = bb + off*4*2; +#endif + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + vres2 = VFMVVF_FLOAT(0.0, vl); + vres3 = VFMVVF_FLOAT(0.0, vl); + vres4 = VFMVVF_FLOAT(0.0, vl); + vres5 = VFMVVF_FLOAT(0.0, vl); + vres6 = VFMVVF_FLOAT(0.0, vl); + vres7 = VFMVVF_FLOAT(0.0, vl); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+vl; // number of values in A +#else + temp = off+4; // number of values in B +#endif + + for (k = temp/4; k > 0; k--) + { + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vax2 = VLSEG2_FLOAT(ptrba, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va0, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va1, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va1, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va0, vl); + + vres4 = OP_rr(vres4, *(ptrbb + 4), va0, vl); + vres5 = OP_ir(vres5, *(ptrbb + 4), va1, vl); + vres4 = OP_ii(vres4, *(ptrbb + 5), va1, vl); + vres5 = OP_ri(vres5, *(ptrbb + 5), va0, vl); + + vres6 = OP_rr(vres6, *(ptrbb + 6), va0, vl); + vres7 = OP_ir(vres7, *(ptrbb + 6), va1, vl); + vres6 = OP_ii(vres6, *(ptrbb + 7), va1, vl); + vres7 = OP_ri(vres7, *(ptrbb + 7), va0, vl); + + ptrbb += 8; + + vax2 = VLSEG2_FLOAT(ptrba, vl); + va4 = VGET_VX2(vax2, 0); + va5 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va3, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va3, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va2, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va2, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va3, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va3, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va2, vl); + + vres4 = OP_rr(vres4, *(ptrbb + 4), va2, vl); + vres5 = OP_ir(vres5, *(ptrbb + 4), va3, vl); + vres4 = OP_ii(vres4, *(ptrbb + 5), va3, vl); + vres5 = OP_ri(vres5, *(ptrbb + 5), va2, vl); + + vres6 = OP_rr(vres6, *(ptrbb + 6), va2, vl); + vres7 = OP_ir(vres7, *(ptrbb + 6), va3, vl); + vres6 = OP_ii(vres6, *(ptrbb + 7), va3, vl); + vres7 = OP_ri(vres7, *(ptrbb + 7), va2, vl); + + ptrbb += 8; + + vax2 = VLSEG2_FLOAT(ptrba, vl); + va6 = VGET_VX2(vax2, 0); + va7 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va5, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va5, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va4, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va4, vl); + vres3 = 
OP_ir(vres3, *(ptrbb + 2), va5, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va5, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va4, vl); + + vres4 = OP_rr(vres4, *(ptrbb + 4), va4, vl); + vres5 = OP_ir(vres5, *(ptrbb + 4), va5, vl); + vres4 = OP_ii(vres4, *(ptrbb + 5), va5, vl); + vres5 = OP_ri(vres5, *(ptrbb + 5), va4, vl); + + vres6 = OP_rr(vres6, *(ptrbb + 6), va4, vl); + vres7 = OP_ir(vres7, *(ptrbb + 6), va5, vl); + vres6 = OP_ii(vres6, *(ptrbb + 7), va5, vl); + vres7 = OP_ri(vres7, *(ptrbb + 7), va4, vl); + + ptrbb += 8; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va6, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va7, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va7, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va6, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va6, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va7, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va7, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va6, vl); + + vres4 = OP_rr(vres4, *(ptrbb + 4), va6, vl); + vres5 = OP_ir(vres5, *(ptrbb + 4), va7, vl); + vres4 = OP_ii(vres4, *(ptrbb + 5), va7, vl); + vres5 = OP_ri(vres5, *(ptrbb + 5), va6, vl); + + vres6 = OP_rr(vres6, *(ptrbb + 6), va6, vl); + vres7 = OP_ir(vres7, *(ptrbb + 6), va7, vl); + vres6 = OP_ii(vres6, *(ptrbb + 7), va7, vl); + vres7 = OP_ri(vres7, *(ptrbb + 7), va6, vl); + + ptrbb += 8; + } + + for (k = temp & 3; k > 0; k--) + { + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va0, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va1, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va1, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va0, vl); + + vres4 = OP_rr(vres4, *(ptrbb + 4), va0, vl); + vres5 = OP_ir(vres5, *(ptrbb + 4), va1, vl); + vres4 = OP_ii(vres4, *(ptrbb + 5), va1, vl); + vres5 = OP_ri(vres5, *(ptrbb + 5), va0, vl); + + vres6 = OP_rr(vres6, *(ptrbb + 6), va0, vl); + vres7 = OP_ir(vres7, *(ptrbb + 6), va1, vl); + vres6 = OP_ii(vres6, *(ptrbb + 7), va1, vl); + vres7 = OP_ri(vres7, *(ptrbb + 7), va0, vl); + + ptrbb += 8; + } + va0 = VFMULVF_FLOAT(vres0, alphar, vl); + va1 = VFMULVF_FLOAT(vres1, alphar, vl); + va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl); + va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl); + + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(C0, vax2, vl); + + va2 = VFMULVF_FLOAT(vres2, alphar, vl); + va3 = VFMULVF_FLOAT(vres3, alphar, vl); + va2 = VFNMSACVF_FLOAT(va2, alphai, vres3, vl); + va3 = VFMACCVF_FLOAT(va3, alphai, vres2, vl); + + vax2 = VSET_VX2(vax2, 0, va2); + vax2 = VSET_VX2(vax2, 1, va3); + VSSEG2_FLOAT(C1, vax2, vl); + + va0 = VFMULVF_FLOAT(vres4, alphar, vl); + va1 = VFMULVF_FLOAT(vres5, alphar, vl); + va0 = VFNMSACVF_FLOAT(va0, alphai, vres5, vl); + va1 = VFMACCVF_FLOAT(va1, alphai, vres4, vl); + + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(C2, vax2, vl); + + va2 = VFMULVF_FLOAT(vres6, alphar, vl); + va3 = VFMULVF_FLOAT(vres7, alphar, vl); + va2 = VFNMSACVF_FLOAT(va2, alphai, vres7, vl); + va3 = VFMACCVF_FLOAT(va3, alphai, vres6, vl); + + vax2 = VSET_VX2(vax2, 0, va2); + vax2 = VSET_VX2(vax2, 1, va3); + VSSEG2_FLOAT(C3, vax2, vl); + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= vl; // number of values in A +#else + temp -= 4; // number of values in B 
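            /* Editorial note (not part of the patch): for the triangular update only part of the
               k range was consumed above; temp now counts the k values that were skipped, so the
               two pointer bumps after this #endif advance ptrba/ptrbb past the untouched
               remainder (2 FLOATs per complex value, vl values per A row of the tile, 4 values
               per B row) so they land on the start of the next tile. */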
+#endif + ptrba += temp*vl*2; + ptrbb += temp*4*2; +#endif + +#ifdef LEFT + off += vl; // number of values in A +#endif + + C0 += vl * 2; + C1 += vl * 2; + C2 += vl * 2; + C3 += vl * 2; + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 4; +#endif + + bb += (bk << 3); + C += (ldc << 3); + } + + if (bn & 2) + { + C0 = C; + C1 = C0 + 2 * ldc; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + ptrba = ba; + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*vl*2; + ptrbb = bb + off*2*2; +#endif + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + vres2 = VFMVVF_FLOAT(0.0, vl); + vres3 = VFMVVF_FLOAT(0.0, vl); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+vl; // number of values in A +#else + temp = off+2; // number of values in B +#endif + for (k = temp/4; k > 0; k--) + { + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vax2 = VLSEG2_FLOAT(ptrba, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va0, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va1, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va1, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va0, vl); + + ptrbb += 4; + + vax2 = VLSEG2_FLOAT(ptrba, vl); + va4 = VGET_VX2(vax2, 0); + va5 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va3, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va3, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va2, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va2, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va3, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va3, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va2, vl); + + ptrbb += 4; + + vax2 = VLSEG2_FLOAT(ptrba, vl); + va6 = VGET_VX2(vax2, 0); + va7 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va5, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va5, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va4, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va4, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va5, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va5, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va4, vl); + + ptrbb += 4; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va6, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va7, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va7, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va6, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va6, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va7, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va7, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va6, vl); + + ptrbb += 4; + } + + for (k = temp & 3; k > 0; k--) + { + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va0, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va1, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va1, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va0, 
vl); + + ptrbb += 4; + } + + va0 = VFMULVF_FLOAT(vres0, alphar, vl); + va1 = VFMULVF_FLOAT(vres1, alphar, vl); + va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl); + va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl); + + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(C0, vax2, vl); + + va2 = VFMULVF_FLOAT(vres2, alphar, vl); + va3 = VFMULVF_FLOAT(vres3, alphar, vl); + va2 = VFNMSACVF_FLOAT(va2, alphai, vres3, vl); + va3 = VFMACCVF_FLOAT(va3, alphai, vres2, vl); + + vax2 = VSET_VX2(vax2, 0, va2); + vax2 = VSET_VX2(vax2, 1, va3); + VSSEG2_FLOAT(C1, vax2, vl); + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= vl; // number of values in A +#else + temp -= 2; // number of values in B +#endif + ptrba += temp*vl*2; + ptrbb += temp*2*2; +#endif + +#ifdef LEFT + off += vl; // number of values in A +#endif + C0 += vl * 2; + C1 += vl * 2; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; +#endif + bb += (bk << 2); + C += (ldc << 2); + } + + if (bn & 1) + { + C0 = C; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + ptrba = ba; + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*vl*2; + ptrbb = bb + off*2; +#endif + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+vl; // number of values in A +#else + temp = off+1; // number of values in B +#endif + for (k = temp/4; k > 0; k--) + { + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vax2 = VLSEG2_FLOAT(ptrba, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + + ptrbb += 2; + + vax2 = VLSEG2_FLOAT(ptrba, vl); + va4 = VGET_VX2(vax2, 0); + va5 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va3, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va3, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va2, vl); + + ptrbb += 2; + + vax2 = VLSEG2_FLOAT(ptrba, vl); + va6 = VGET_VX2(vax2, 0); + va7 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va5, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va5, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va4, vl); + + ptrbb += 2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va6, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va7, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va7, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va6, vl); + + ptrbb += 2; + } + + for (k = temp & 3; k > 0; k--) + { + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + + ptrbb += 2; + } + + va0 = VFMULVF_FLOAT(vres0, alphar, vl); + va1 = VFMULVF_FLOAT(vres1, alphar, vl); + va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl); + va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl); + + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + 
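            /* Editorial note (not part of the patch): va0/va1 hold the alpha-scaled real and
               imaginary result vectors. VSET_VX2 packs them into the two fields of a segment-2
               tuple so that the vsseg2e store below writes them back to C0 in the interleaved
               re,im,re,im,... layout used for complex matrices. */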
VSSEG2_FLOAT(C0, vax2, vl); + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= vl; // number of values in A +#else + temp -= 1; // number of values in B +#endif + ptrba += temp*vl*2; + ptrbb += temp*2; +#endif + +#ifdef LEFT + off += vl; // number of values in A +#endif + C0 += vl * 2; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 1; +#endif + bb += bk << 1; + C += ldc << 1; + } + return 0; +} diff --git a/kernel/riscv64/ztrsm_lncopy_rvv_v1.c b/kernel/riscv64/ztrsm_lncopy_rvv_v1.c new file mode 100644 index 000000000..36cec711d --- /dev/null +++ b/kernel/riscv64/ztrsm_lncopy_rvv_v1.c @@ -0,0 +1,115 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
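Stepping back from the ZGEMM/ZTRMM RVV kernel that ends above: each k step accumulates the real and imaginary parts of a C tile separately through the OP_rr/OP_ir/OP_ii/OP_ri macros, and alpha is applied only once per tile right before the segmented store. A scalar C sketch of that arithmetic, assuming the plain non-conjugated case (the actual sign pattern comes from the OP_* macro definitions earlier in the file, which are outside this hunk); the helper names are illustrative only:

/* One accumulation step: what OP_rr/OP_ir/OP_ii/OP_ri add up for a
 * single lane, assuming the non-conjugated variant. */
static void zaccum_step_sketch(double *acc_r, double *acc_i,
                               double a_r, double a_i,  /* A lane: va0 / va1      */
                               double b_r, double b_i)  /* B scalars: ptrbb[0..1] */
{
    *acc_r += b_r * a_r;   /* OP_rr */
    *acc_i += b_r * a_i;   /* OP_ir */
    *acc_r -= b_i * a_i;   /* OP_ii (sign flips in the conjugated variants) */
    *acc_i += b_i * a_r;   /* OP_ri */
}

/* Alpha applied once per tile, mirroring the VFMULVF / VFNMSACVF /
 * VFMACCVF sequence just before VSSEG2_FLOAT. */
static void zapply_alpha_sketch(double *c_r, double *c_i,
                                double acc_r, double acc_i,
                                double alphar, double alphai)
{
    *c_r = acc_r * alphar - alphai * acc_i;
    *c_i = acc_i * alphar + alphai * acc_r;
}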
+*****************************************************************************/ + +#include <stdio.h> +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_VX2_T vfloat32m2x2_t +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e32_v_f32m2x2_m +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u32m2_b16 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_VX2_T vfloat64m2x2_t +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e64_v_f64m2x2_m +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u64m2_b32 + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + //fprintf(stderr, "%s , %s, m = %4ld n = %4ld lda = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, lda, offset); // Debug + + BLASLONG i, ii, jj, js; + + FLOAT *ao; + + jj = offset; + + BLASLONG stride_lda = sizeof(FLOAT)*lda*2; + + FLOAT_VX2_T vax2; + VBOOL_T vbool_cmp; + UINT_V_T vindex; + size_t vl; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + ao = a; + + ii = 0; + for (i = 0; i < m;) + { + if (ii == jj) + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl); + vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); + VSSEG2_FLOAT_M(vbool_cmp, b, vax2, vl); + + compinv((b + j * 2), *(ao + j * lda * 2), *(ao + j * lda * 2 + 1)); + ao += 2; + b += vl * 2; + } + i += vl; + ii += vl; + } + else + { + if (ii > jj) + { + vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl); + VSSEG2_FLOAT(b, vax2, vl); + } + ao += 2; + b += vl * 2; + i++; + ii++; + } + } + + a += vl * lda * 2; + jj += vl; + } + + return 0; +} diff --git a/kernel/riscv64/ztrsm_ltcopy_rvv_v1.c b/kernel/riscv64/ztrsm_ltcopy_rvv_v1.c new file mode 100644 index 000000000..3a7bdb522 --- /dev/null +++ b/kernel/riscv64/ztrsm_ltcopy_rvv_v1.c @@ -0,0 +1,114 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED.
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include <stdio.h> +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_VX2_T vfloat32m2x2_t +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e32_v_f32m2x2_m +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u32m2_b16 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_VX2_T vfloat64m2x2_t +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e64_v_f64m2x2_m +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u64m2_b32 +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + //fprintf(stderr, "%s , %s, m = %4ld n = %4ld lda = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, lda, offset); // Debug + + BLASLONG i, ii, jj, js; + + FLOAT *ao; + + jj = offset; + + FLOAT_VX2_T vax2; + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + size_t vl; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + ao = a; + + ii = 0; + for (i = 0; i < m;) + { + + if (ii == jj) + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + compinv((b + j * 2), *(ao + j * 2), *(ao + j * 2 + 1)); + + vax2 = VLSEG2_FLOAT(ao, vl); + vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); + VSSEG2_FLOAT_M(vbool_cmp, b, vax2, vl); + + b += vl * 2; + ao += lda * 2; + } + i += vl; + ii += vl; + } + else + { + if (ii < jj) + { + vax2 = VLSEG2_FLOAT(ao, vl); + VSSEG2_FLOAT(b, vax2, vl); + } + ao += lda * 2; + b += vl * 2; + i ++; + ii ++; + } + } + + a += vl * 2; + jj += vl; + } + return 0; +} + diff --git a/kernel/riscv64/ztrsm_uncopy_rvv_v1.c b/kernel/riscv64/ztrsm_uncopy_rvv_v1.c new file mode 100644 index 000000000..2a158d4de --- /dev/null +++ b/kernel/riscv64/ztrsm_uncopy_rvv_v1.c @@ -0,0 +1,113 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission.
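The TRSM copy kernels in this group all follow one pattern: panels away from the diagonal are copied verbatim with a (possibly strided) two-segment load/store, while inside a diagonal block each of the vl rows or columns keeps only the elements selected by a vector-index mask (VMSLTU/VMSGTU against the loop counter j), and its diagonal entry is written through compinv, the existing OpenBLAS helper that stores the diagonal in the inverted form the TRSM solve kernels expect. A scalar sketch of one diagonal block of the LT variant above, for illustration only (the real code replaces the inner element loop with a single masked VSSEG2_FLOAT_M store):

static void ztrsm_ltcopy_diag_block_sketch(long vl, const double *ao, long lda,
                                           double *b)
{
    for (long j = 0; j < vl; j++) {
        /* Real kernel: compinv(b + j * 2, ao[j * 2], ao[j * 2 + 1]);
         * i.e. the diagonal element of this row, stored inverted for TRSM. */
        for (long col = 0; col < vl; col++) {
            if (col > j) {                        /* VMSGTU mask: strictly above the diagonal */
                b[col * 2]     = ao[col * 2];     /* real part      */
                b[col * 2 + 1] = ao[col * 2 + 1]; /* imaginary part */
            }
        }
        b  += vl * 2;    /* next packed row of the block */
        ao += lda * 2;   /* next source row              */
    }
}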
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include <stdio.h> +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_VX2_T vfloat32m2x2_t +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e32_v_f32m2x2_m +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u32m2_b16 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_VX2_T vfloat64m2x2_t +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e64_v_f64m2x2_m +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u64m2_b32 +#endif + + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + //fprintf(stderr, "%s , %s, m = %4ld n = %4ld lda = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, lda, offset); // Debug + + BLASLONG i, ii, jj, js; + BLASLONG stride_lda = sizeof(FLOAT)*lda*2; + + FLOAT *ao; + jj = offset; + + FLOAT_VX2_T vax2; + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + size_t vl; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + ao = a; + + i = 0; + ii = 0; + for (i = 0; i < m;) + { + if (ii == jj) + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + compinv((b + j * 2), *(ao + j * lda * 2), *(ao + j * lda * 2 + 1)); + vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl); + vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); + VSSEG2_FLOAT_M(vbool_cmp, b, vax2, vl); + ao += 2; + b += vl * 2; + } + i += vl; + ii += vl; + } + else + { + if (ii < jj) + { + vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl); + VSSEG2_FLOAT(b, vax2, vl); + } + ao += 2; + b += vl * 2; + i++; + ii++; + } + } + + a += vl * lda * 2; + jj += vl; + } + return 0; +} diff --git a/kernel/riscv64/ztrsm_utcopy_rvv_v1.c b/kernel/riscv64/ztrsm_utcopy_rvv_v1.c new file mode 100644 index 000000000..4b3319588 --- /dev/null +++ b/kernel/riscv64/ztrsm_utcopy_rvv_v1.c @@ -0,0 +1,115 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2.
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include <stdio.h> +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_VX2_T vfloat32m2x2_t +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e32_v_f32m2x2_m +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u32m2_b16 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_VX2_T vfloat64m2x2_t +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e64_v_f64m2x2_m +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u64m2_b32 +#endif + + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + //fprintf(stderr, "%s , %s, m = %4ld n = %4ld lda = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, lda, offset); // Debug + + BLASLONG i, ii, jj, js; + + FLOAT *ao; + + jj = offset; + FLOAT_VX2_T vax2; + + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + size_t vl; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + ao = a; + + ii = 0; + for (i = 0; i < m;) + { + + if (ii == jj) + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + vax2 = VLSEG2_FLOAT(ao, vl); + vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); + VSSEG2_FLOAT_M(vbool_cmp, b, vax2, vl); + + compinv((b + j * 2), *(ao + j * 2), *(ao + j * 2 + 1)); + + ao += lda * 2; + b += vl * 2; + } + i += vl; + ii += vl; + } + else + { + if (ii > jj) + { + vax2 = VLSEG2_FLOAT(ao, vl); + VSSEG2_FLOAT(b, vax2, vl); + } + ao += lda * 2; + b += vl * 2; + i ++; + ii ++; + } + } + + a += vl * 2; + jj += vl; + } + + return 0; +} diff --git a/kernel/x86/zscal.S b/kernel/x86/zscal.S index 1eb518563..adb56edb9 100644 --- a/kernel/x86/zscal.S +++ b/kernel/x86/zscal.S @@ -98,7 +98,7 @@ fcomip %st(1), %st ffreep %st(0) jne .L30 - +jp .L30 EMMS pxor %mm0, %mm0 diff --git a/kernel/x86/zscal_sse.S b/kernel/x86/zscal_sse.S index e011c98f5..89e36251e 100644 --- a/kernel/x86/zscal_sse.S +++ b/kernel/x86/zscal_sse.S @@ -87,6 +87,7 @@ xorps %xmm7, %xmm7 comiss %xmm0, %xmm7 jne .L100 # Alpha_r != ZERO +
jp .L100 # Alpha_r NaN comiss %xmm1, %xmm7 jne .L100 # Alpha_i != ZERO diff --git a/kernel/x86/zscal_sse2.S b/kernel/x86/zscal_sse2.S index cc7ab6686..0bc61b209 100644 --- a/kernel/x86/zscal_sse2.S +++ b/kernel/x86/zscal_sse2.S @@ -98,6 +98,7 @@ xorps %xmm7, %xmm7 comisd %xmm0, %xmm7 jne .L100 + jp .L100 comisd %xmm1, %xmm7 jne .L100 diff --git a/kernel/x86_64/KERNEL b/kernel/x86_64/KERNEL index bea7036c2..f8278c3b4 100644 --- a/kernel/x86_64/KERNEL +++ b/kernel/x86_64/KERNEL @@ -405,7 +405,7 @@ DGEMVNKERNEL = dgemv_n.S endif ifndef DGEMVTKERNEL -DGEMVTKERNEL = dgemv_t.S +DGEMVTKERNEL = dgemv_t_4.c endif ifndef CGEMVNKERNEL diff --git a/kernel/x86_64/KERNEL.COOPERLAKE b/kernel/x86_64/KERNEL.COOPERLAKE index dba94aea8..22b042029 100644 --- a/kernel/x86_64/KERNEL.COOPERLAKE +++ b/kernel/x86_64/KERNEL.COOPERLAKE @@ -1,5 +1,5 @@ include $(KERNELDIR)/KERNEL.SKYLAKEX - +ifneq ($(NO_AVX512BF16), 1) SBGEMM_SMALL_M_PERMIT = sbgemm_small_kernel_permit_cooperlake.c SBGEMM_SMALL_K_NN = sbgemm_small_kernel_nn_cooperlake.c SBGEMM_SMALL_K_B0_NN = sbgemm_small_kernel_nn_cooperlake.c @@ -20,3 +20,4 @@ SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX) SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX) SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) +endif diff --git a/kernel/x86_64/KERNEL.SAPPHIRERAPIDS b/kernel/x86_64/KERNEL.SAPPHIRERAPIDS index 3a832e917..0ab2b4ddc 100644 --- a/kernel/x86_64/KERNEL.SAPPHIRERAPIDS +++ b/kernel/x86_64/KERNEL.SAPPHIRERAPIDS @@ -1,5 +1,6 @@ include $(KERNELDIR)/KERNEL.COOPERLAKE +ifneq ($(NO_AVX512BF16), 1) SBGEMM_SMALL_M_PERMIT = SBGEMM_SMALL_K_NN = SBGEMM_SMALL_K_B0_NN = @@ -20,3 +21,4 @@ SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX) SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX) SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) +endif diff --git a/kernel/x86_64/zscal.c b/kernel/x86_64/zscal.c index 45e3531b8..bc79c0caf 100644 --- a/kernel/x86_64/zscal.c +++ b/kernel/x86_64/zscal.c @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
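The jp instructions added after the compare-with-zero tests above close a NaN hole in the zero-alpha fast path: comiss/comisd (and fcomip) report an unordered compare, i.e. a NaN operand, by setting both ZF and PF, so jne alone reads a NaN alpha as "equal to zero" and the kernel would zero the vector instead of propagating NaN. The extra jp sends the unordered case to the general scaling path as well. The same condition in C, purely for illustration:

#include <math.h>

/* After "comiss %xmm0, %xmm7" with xmm7 = 0: ZF = PF = CF = 1 when the
 * compare is unordered (alpha_r is NaN). The general path must be taken
 * when the value is nonzero OR the compare was unordered. */
static int take_general_path(float alpha_r)
{
    return (alpha_r != 0.0f) || isnan(alpha_r);   /* jne .L100 || jp .L100 */
}

(In C the isnan() term is redundant, since NaN != 0.0f already evaluates true; it is spelled out here only to mirror the two branches in the assembly.)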
#endif #include "common.h" - +#include <float.h> #if defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "zscal_microk_skylakex-2.c" @@ -69,16 +69,16 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha , FLOAT *x ) for( i=0; i FLT_MAX) { + while(j < n) + { + x[i]= NAN; + x[i+1] = da_r; + i += 2 ; + j++; + + } + + } else { while(j < n) { - temp0 = -da_i * x[i+1]; + if (x[i] < -FLT_MAX || x[i] > FLT_MAX) + temp0 = NAN; x[i+1] = da_i * x[i]; - x[i] = temp0; + if ( x[i] == x[i]) //preserve NaN + x[i] = temp0; i += 2 ; j++; @@ -409,28 +425,24 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, } else { - - if ( da_i == 0.0 ) + if (da_i == 0.0) { + while(j < n) + { - while(j < n) - { - temp0 = da_r * x[i]; x[i+1] = da_r * x[i+1]; x[i] = temp0; i += 2 ; j++; - } - + } } else { while(j < n) { - temp0 = da_r * x[i] - da_i * x[i+1]; x[i+1] = da_r * x[i+1] + da_i * x[i]; x[i] = temp0; @@ -439,11 +451,9 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, } - } + } } return(0); } - - diff --git a/kernel/x86_64/zscal_sse2.S b/kernel/x86_64/zscal_sse2.S index 223b1e439..d6a49136d 100644 --- a/kernel/x86_64/zscal_sse2.S +++ b/kernel/x86_64/zscal_sse2.S @@ -82,6 +82,7 @@ pxor %xmm15, %xmm15 comisd %xmm0, %xmm15 jne .L100 + jp .L100 comisd %xmm1, %xmm15 jne .L100 diff --git a/kernel/zarch/zscal.c b/kernel/zarch/zscal.c index d39b8447e..4160a1a76 100644 --- a/kernel/zarch/zscal.c +++ b/kernel/zarch/zscal.c @@ -233,9 +233,15 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, while (j < n1) { - temp0 = -da_i * x[i + 1]; + if (isnan(x[i]) || isinf(x[i])) + temp0 = NAN; + else + temp0 = -da_i * x[i + 1]; x[i + 1] = da_i * x[i]; x[i] = temp0; + if (isnan(x[i + inc_x]) || isinf(x[i + inc_x])) + temp1 = NAN; + else temp1 = -da_i * x[i + 1 + inc_x]; x[i + 1 + inc_x] = da_i * x[i + inc_x]; x[i + inc_x] = temp1; @@ -246,7 +252,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, while (j < n) { - temp0 = -da_i * x[i + 1]; + if (isnan(x[i]) || isinf(x[i])) + temp0 = NAN; + else + temp0 = -da_i * x[i + 1]; x[i + 1] = da_i * x[i]; x[i] = temp0; i += inc_x; @@ -320,7 +329,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, if (da_i == 0) zscal_kernel_8_zero(n1, x); else - zscal_kernel_8_zero_r(n1, alpha, x); + zscal_kernel_8(n1, da_r, da_i, x); else if (da_i == 0) zscal_kernel_8_zero_i(n1, alpha, x); else @@ -347,7 +356,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, while (j < n) { - temp0 = -da_i * x[i + 1]; + if (isnan(x[i]) || isinf(x[i])) + temp0 = NAN; + else + temp0 = -da_i * x[i + 1]; x[i + 1] = da_i * x[i]; x[i] = temp0; i += 2; diff --git a/lapack-netlib/LAPACKE/include/lapacke_config.h b/lapack-netlib/LAPACKE/include/lapacke_config.h index c64fc4416..798a5eb2e 100644 --- a/lapack-netlib/LAPACKE/include/lapacke_config.h +++ b/lapack-netlib/LAPACKE/include/lapacke_config.h @@ -68,6 +68,17 @@ extern "C" { #endif #ifndef LAPACK_COMPLEX_CUSTOM +#if defined(_MSC_VER) + #define _CRT_USE_C_COMPLEX_H + #include <complex.h> + #define LAPACK_COMPLEX_CUSTOM + #define lapack_complex_float _Fcomplex + #define lapack_complex_double _Dcomplex + #define lapack_complex_float_real(z) (creal(z)) + #define lapack_complex_float_imag(z) (cimag(z)) + #define lapack_complex_double_real(z) (creal(z)) + #define lapack_complex_double_imag(z) (cimag(z)) +#else #if defined(LAPACK_COMPLEX_STRUCTURE) @@ -109,6 +120,7 @@ typedef struct {
double real, imag; } _lapack_complex_double; #define lapack_complex_double_real(z) (creal(z)) #define lapack_complex_double_imag(z) (cimag(z)) +#endif #endif lapack_complex_float lapack_make_complex_float( float re, float im ); diff --git a/lapack-netlib/SRC/cbbcsd.f b/lapack-netlib/SRC/cbbcsd.f index 1cd1ffbf1..4d0c45efe 100644 --- a/lapack-netlib/SRC/cbbcsd.f +++ b/lapack-netlib/SRC/cbbcsd.f @@ -805,7 +805,7 @@ CALL SLARTGP( B22BULGE, B22E(I-1), RWORK(IU2SN+I-1), $ RWORK(IU2CS+I-1), R ) ELSE IF( NU .LT. MU ) THEN - CALL SLARTGS( B21E(I), B21E(I+1), NU, RWORK(IU2CS+I-1), + CALL SLARTGS( B21E(I), B21D(I+1), NU, RWORK(IU2CS+I-1), $ RWORK(IU2SN+I-1) ) ELSE CALL SLARTGS( B22D(I), B22E(I), MU, RWORK(IU2CS+I-1), diff --git a/lapack-netlib/SRC/cgebrd.f b/lapack-netlib/SRC/cgebrd.f index 5687161a5..5920b1cf5 100644 --- a/lapack-netlib/SRC/cgebrd.f +++ b/lapack-netlib/SRC/cgebrd.f @@ -123,7 +123,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of the array WORK. LWORK >= max(1,M,N). +*> The length of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MAX(M,N), otherwise. *> For optimum performance LWORK >= (M+N)*NB, where NB *> is the optimal blocksize. *> @@ -148,7 +149,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEcomputational +*> \ingroup gebrd * *> \par Further Details: * ===================== @@ -225,8 +226,8 @@ * .. * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IINFO, J, LDWRKX, LDWRKY, LWKOPT, MINMN, NB, - $ NBMIN, NX, WS + INTEGER I, IINFO, J, LDWRKX, LDWRKY, LWKMIN, LWKOPT, + $ MINMN, NB, NBMIN, NX, WS * .. * .. External Subroutines .. EXTERNAL CGEBD2, CGEMM, CLABRD, XERBLA @@ -236,16 +237,24 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 - NB = MAX( 1, ILAENV( 1, 'CGEBRD', ' ', M, N, -1, -1 ) ) - LWKOPT = ( M+N )*NB - WORK( 1 ) = REAL( LWKOPT ) + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = MAX( M, N ) + NB = MAX( 1, ILAENV( 1, 'CGEBRD', ' ', M, N, -1, -1 ) ) + LWKOPT = ( M+N )*NB + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -253,7 +262,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, M, N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF IF( INFO.LT.0 ) THEN @@ -265,7 +274,6 @@ * * Quick return if possible * - MINMN = MIN( M, N ) IF( MINMN.EQ.0 ) THEN WORK( 1 ) = 1 RETURN @@ -284,7 +292,7 @@ * Determine when to switch from blocked to unblocked code. * IF( NX.LT.MINMN ) THEN - WS = ( M+N )*NB + WS = LWKOPT IF( LWORK.LT.WS ) THEN * * Not enough work space for the optimal NB, consider using @@ -343,7 +351,7 @@ * CALL CGEBD2( M-I+1, N-I+1, A( I, I ), LDA, D( I ), E( I ), $ TAUQ( I ), TAUP( I ), WORK, IINFO ) - WORK( 1 ) = WS + WORK( 1 ) = SROUNDUP_LWORK( WS ) RETURN * * End of CGEBRD diff --git a/lapack-netlib/SRC/cgedmd.f90 b/lapack-netlib/SRC/cgedmd.f90 index 499489270..1413130ec 100644 --- a/lapack-netlib/SRC/cgedmd.f90 +++ b/lapack-netlib/SRC/cgedmd.f90 @@ -1,22 +1,526 @@ +!> \brief \b CGEDMD computes the Dynamic Mode Decomposition (DMD) for a pair of data snapshot matrices. +! +! =========== DOCUMENTATION =========== +! +! Definition: +! =========== +! +! SUBROUTINE CGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & +! M, N, X, LDX, Y, LDY, NRNK, TOL, & +! 
K, EIGS, Z, LDZ, RES, B, LDB, & +! W, LDW, S, LDS, ZWORK, LZWORK, & +! RWORK, LRWORK, IWORK, LIWORK, INFO ) +!..... +! USE iso_fortran_env +! IMPLICIT NONE +! INTEGER, PARAMETER :: WP = real32 +! +!..... +! Scalar arguments +! CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF +! INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & +! NRNK, LDZ, LDB, LDW, LDS, & +! LIWORK, LRWORK, LZWORK +! INTEGER, INTENT(OUT) :: K, INFO +! REAL(KIND=WP), INTENT(IN) :: TOL +! Array arguments +! COMPLEX(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) +! COMPLEX(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & +! W(LDW,*), S(LDS,*) +! COMPLEX(KIND=WP), INTENT(OUT) :: EIGS(*) +! COMPLEX(KIND=WP), INTENT(OUT) :: ZWORK(*) +! REAL(KIND=WP), INTENT(OUT) :: RES(*) +! REAL(KIND=WP), INTENT(OUT) :: RWORK(*) +! INTEGER, INTENT(OUT) :: IWORK(*) +! +!............................................................ +!> \par Purpose: +! ============= +!> \verbatim +!> CGEDMD computes the Dynamic Mode Decomposition (DMD) for +!> a pair of data snapshot matrices. For the input matrices +!> X and Y such that Y = A*X with an unaccessible matrix +!> A, CGEDMD computes a certain number of Ritz pairs of A using +!> the standard Rayleigh-Ritz extraction from a subspace of +!> range(X) that is determined using the leading left singular +!> vectors of X. Optionally, CGEDMD returns the residuals +!> of the computed Ritz pairs, the information needed for +!> a refinement of the Ritz vectors, or the eigenvectors of +!> the Exact DMD. +!> For further details see the references listed +!> below. For more details of the implementation see [3]. +!> \endverbatim +!............................................................ +!> \par References: +! ================ +!> \verbatim +!> [1] P. Schmid: Dynamic mode decomposition of numerical +!> and experimental data, +!> Journal of Fluid Mechanics 656, 5-28, 2010. +!> [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal +!> decompositions: analysis and enhancements, +!> SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. +!> [3] Z. Drmac: A LAPACK implementation of the Dynamic +!> Mode Decomposition I. Technical report. AIMDyn Inc. +!> and LAPACK Working Note 298. +!> [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. +!> Brunton, N. Kutz: On Dynamic Mode Decomposition: +!> Theory and Applications, Journal of Computational +!> Dynamics 1(2), 391 -421, 2014. +!> \endverbatim +!...................................................................... +!> \par Developed and supported by: +! ================================ +!> \verbatim +!> Developed and coded by Zlatko Drmac, Faculty of Science, +!> University of Zagreb; drmac@math.hr +!> In cooperation with +!> AIMdyn Inc., Santa Barbara, CA. +!> and supported by +!> - DARPA SBIR project "Koopman Operator-Based Forecasting +!> for Nonstationary Processes from Near-Term, Limited +!> Observational Data" Contract No: W31P4Q-21-C-0007 +!> - DARPA PAI project "Physics-Informed Machine Learning +!> Methodologies" Contract No: HR0011-18-9-0033 +!> - DARPA MoDyL project "A Data-Driven, Operator-Theoretic +!> Framework for Space-Time Analysis of Process Dynamics" +!> Contract No: HR0011-16-C-0116 +!> Any opinions, findings and conclusions or recommendations +!> expressed in this material are those of the author and +!> do not necessarily reflect the views of the DARPA SBIR +!> Program Office +!> \endverbatim +!...................................................................... +!> \par Distribution Statement A: +! 
============================== +!> \verbatim +!> Approved for Public Release, Distribution Unlimited. +!> Cleared by DARPA on September 29, 2022 +!> \endverbatim +!...................................................................... +! Arguments +! ========= +! +!> \param[in] JOBS +!> \verbatim +!> JOBS (input) CHARACTER*1 +!> Determines whether the initial data snapshots are scaled +!> by a diagonal matrix. +!> 'S' :: The data snapshots matrices X and Y are multiplied +!> with a diagonal matrix D so that X*D has unit +!> nonzero columns (in the Euclidean 2-norm) +!> 'C' :: The snapshots are scaled as with the 'S' option. +!> If it is found that an i-th column of X is zero +!> vector and the corresponding i-th column of Y is +!> non-zero, then the i-th column of Y is set to +!> zero and a warning flag is raised. +!> 'Y' :: The data snapshots matrices X and Y are multiplied +!> by a diagonal matrix D so that Y*D has unit +!> nonzero columns (in the Euclidean 2-norm) +!> 'N' :: No data scaling. +!> \endverbatim +!..... +!> \param[in] JOBZ +!> \verbatim +!> JOBZ (input) CHARACTER*1 +!> Determines whether the eigenvectors (Koopman modes) will +!> be computed. +!> 'V' :: The eigenvectors (Koopman modes) will be computed +!> and returned in the matrix Z. +!> See the description of Z. +!> 'F' :: The eigenvectors (Koopman modes) will be returned +!> in factored form as the product X(:,1:K)*W, where X +!> contains a POD basis (leading left singular vectors +!> of the data matrix X) and W contains the eigenvectors +!> of the corresponding Rayleigh quotient. +!> See the descriptions of K, X, W, Z. +!> 'N' :: The eigenvectors are not computed. +!> \endverbatim +!..... +!> \param[in] JOBR +!> \verbatim +!> JOBR (input) CHARACTER*1 +!> Determines whether to compute the residuals. +!> 'R' :: The residuals for the computed eigenpairs will be +!> computed and stored in the array RES. +!> See the description of RES. +!> For this option to be legal, JOBZ must be 'V'. +!> 'N' :: The residuals are not computed. +!> \endverbatim +!..... +!> \param[in] JOBF +!> \verbatim +!> JOBF (input) CHARACTER*1 +!> Specifies whether to store information needed for post- +!> processing (e.g. computing refined Ritz vectors) +!> 'R' :: The matrix needed for the refinement of the Ritz +!> vectors is computed and stored in the array B. +!> See the description of B. +!> 'E' :: The unscaled eigenvectors of the Exact DMD are +!> computed and returned in the array B. See the +!> description of B. +!> 'N' :: No eigenvector refinement data is computed. +!> \endverbatim +!..... +!> \param[in] WHTSVD +!> \verbatim +!> WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } +!> Allows for a selection of the SVD algorithm from the +!> LAPACK library. +!> 1 :: CGESVD (the QR SVD algorithm) +!> 2 :: CGESDD (the Divide and Conquer algorithm; if enough +!> workspace available, this is the fastest option) +!> 3 :: CGESVDQ (the preconditioned QR SVD ; this and 4 +!> are the most accurate options) +!> 4 :: CGEJSV (the preconditioned Jacobi SVD; this and 3 +!> are the most accurate options) +!> For the four methods above, a significant difference in +!> the accuracy of small singular values is possible if +!> the snapshots vary in norm so that X is severely +!> ill-conditioned. If small (smaller than EPS*||X||) +!> singular values are of interest and JOBS=='N', then +!> the options (3, 4) give the most accurate results, where +!> the option 4 is slightly better and with stronger +!> theoretical background. +!> If JOBS=='S', i.e. 
the columns of X will be normalized, +!> then all methods give nearly equally accurate results. +!> \endverbatim +!..... +!> \param[in] M +!> \verbatim +!> M (input) INTEGER, M>= 0 +!> The state space dimension (the row dimension of X, Y). +!> \endverbatim +!..... +!> \param[in] N +!> \verbatim +!> N (input) INTEGER, 0 <= N <= M +!> The number of data snapshot pairs +!> (the number of columns of X and Y). +!> \endverbatim +!..... +!> \param[in,out] X +!> \verbatim +!> X (input/output) COMPLEX(KIND=WP) M-by-N array +!> > On entry, X contains the data snapshot matrix X. It is +!> assumed that the column norms of X are in the range of +!> the normalized floating point numbers. +!> < On exit, the leading K columns of X contain a POD basis, +!> i.e. the leading K left singular vectors of the input +!> data matrix X, U(:,1:K). All N columns of X contain all +!> left singular vectors of the input matrix X. +!> See the descriptions of K, Z and W. +!> \endverbatim +!..... +!> \param[in] LDX +!> \verbatim +!> LDX (input) INTEGER, LDX >= M +!> The leading dimension of the array X. +!> \endverbatim +!..... +!> \param[in,out] Y +!> \verbatim +!> Y (input/workspace/output) COMPLEX(KIND=WP) M-by-N array +!> > On entry, Y contains the data snapshot matrix Y +!> < On exit, +!> If JOBR == 'R', the leading K columns of Y contain +!> the residual vectors for the computed Ritz pairs. +!> See the description of RES. +!> If JOBR == 'N', Y contains the original input data, +!> scaled according to the value of JOBS. +!> \endverbatim +!..... +!> \param[in] LDY +!> \verbatim +!> LDY (input) INTEGER , LDY >= M +!> The leading dimension of the array Y. +!> \endverbatim +!..... +!> \param[in] NRNK +!> \verbatim +!> NRNK (input) INTEGER +!> Determines the mode how to compute the numerical rank, +!> i.e. how to truncate small singular values of the input +!> matrix X. On input, if +!> NRNK = -1 :: i-th singular value sigma(i) is truncated +!> if sigma(i) <= TOL*sigma(1) +!> This option is recommended. +!> NRNK = -2 :: i-th singular value sigma(i) is truncated +!> if sigma(i) <= TOL*sigma(i-1) +!> This option is included for R&D purposes. +!> It requires highly accurate SVD, which +!> may not be feasible. +!> The numerical rank can be enforced by using positive +!> value of NRNK as follows: +!> 0 < NRNK <= N :: at most NRNK largest singular values +!> will be used. If the number of the computed nonzero +!> singular values is less than NRNK, then only those +!> nonzero values will be used and the actually used +!> dimension is less than NRNK. The actual number of +!> the nonzero singular values is returned in the variable +!> K. See the descriptions of TOL and K. +!> \endverbatim +!..... +!> \param[in] TOL +!> \verbatim +!> TOL (input) REAL(KIND=WP), 0 <= TOL < 1 +!> The tolerance for truncating small singular values. +!> See the description of NRNK. +!> \endverbatim +!..... +!> \param[out] K +!> \verbatim +!> K (output) INTEGER, 0 <= K <= N +!> The dimension of the POD basis for the data snapshot +!> matrix X and the number of the computed Ritz pairs. +!> The value of K is determined according to the rule set +!> by the parameters NRNK and TOL. +!> See the descriptions of NRNK and TOL. +!> \endverbatim +!..... +!> \param[out] EIGS +!> \verbatim +!> EIGS (output) COMPLEX(KIND=WP) N-by-1 array +!> The leading K (K<=N) entries of EIGS contain +!> the computed eigenvalues (Ritz values). +!> See the descriptions of K, and Z. +!> \endverbatim +!..... 
+!> \param[out] Z +!> \verbatim +!> Z (workspace/output) COMPLEX(KIND=WP) M-by-N array +!> If JOBZ =='V' then Z contains the Ritz vectors. Z(:,i) +!> is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1. +!> If JOBZ == 'F', then the Z(:,i)'s are given implicitly as +!> the columns of X(:,1:K)*W(1:K,1:K), i.e. X(:,1:K)*W(:,i) +!> is an eigenvector corresponding to EIGS(i). The columns +!> of W(1:k,1:K) are the computed eigenvectors of the +!> K-by-K Rayleigh quotient. +!> See the descriptions of EIGS, X and W. +!> \endverbatim +!..... +!> \param[in] LDZ +!> \verbatim +!> LDZ (input) INTEGER , LDZ >= M +!> The leading dimension of the array Z. +!> \endverbatim +!..... +!> \param[out] RES +!> \verbatim +!> RES (output) REAL(KIND=WP) N-by-1 array +!> RES(1:K) contains the residuals for the K computed +!> Ritz pairs, +!> RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i))||_2. +!> See the description of EIGS and Z. +!> \endverbatim +!..... +!> \param[out] B +!> \verbatim +!> B (output) COMPLEX(KIND=WP) M-by-N array. +!> IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can +!> be used for computing the refined vectors; see further +!> details in the provided references. +!> If JOBF == 'E', B(1:M,1:K) contains +!> A*U(:,1:K)*W(1:K,1:K), which are the vectors from the +!> Exact DMD, up to scaling by the inverse eigenvalues. +!> If JOBF =='N', then B is not referenced. +!> See the descriptions of X, W, K. +!> \endverbatim +!..... +!> \param[in] LDB +!> \verbatim +!> LDB (input) INTEGER, LDB >= M +!> The leading dimension of the array B. +!> \endverbatim +!..... +!> \param[out] W +!> \verbatim +!> W (workspace/output) COMPLEX(KIND=WP) N-by-N array +!> On exit, W(1:K,1:K) contains the K computed +!> eigenvectors of the matrix Rayleigh quotient. +!> The Ritz vectors (returned in Z) are the +!> product of X (containing a POD basis for the input +!> matrix X) and W. See the descriptions of K, S, X and Z. +!> W is also used as a workspace to temporarily store the +!> right singular vectors of X. +!> \endverbatim +!..... +!> \param[in] LDW +!> \verbatim +!> LDW (input) INTEGER, LDW >= N +!> The leading dimension of the array W. +!> \endverbatim +!..... +!> \param[out] S +!> \verbatim +!> S (workspace/output) COMPLEX(KIND=WP) N-by-N array +!> The array S(1:K,1:K) is used for the matrix Rayleigh +!> quotient. This content is overwritten during +!> the eigenvalue decomposition by CGEEV. +!> See the description of K. +!> \endverbatim +!..... +!> \param[in] LDS +!> \verbatim +!> LDS (input) INTEGER, LDS >= N +!> The leading dimension of the array S. +!> \endverbatim +!..... +!> \param[out] ZWORK +!> \verbatim +!> ZWORK (workspace/output) COMPLEX(KIND=WP) LZWORK-by-1 array +!> ZWORK is used as complex workspace in the complex SVD, as +!> specified by WHTSVD (1,2, 3 or 4) and for CGEEV for computing +!> the eigenvalues of a Rayleigh quotient. +!> If the call to CGEDMD is only workspace query, then +!> ZWORK(1) contains the minimal complex workspace length and +!> ZWORK(2) is the optimal complex workspace length. +!> Hence, the length of work is at least 2. +!> See the description of LZWORK. +!> \endverbatim +!..... +!> \param[in] LZWORK +!> \verbatim +!> LZWORK (input) INTEGER +!> The minimal length of the workspace vector ZWORK. 
+!> LZWORK is calculated as MAX(LZWORK_SVD, LZWORK_CGEEV), +!> where LZWORK_CGEEV = MAX( 1, 2*N ) and the minimal +!> LZWORK_SVD is calculated as follows +!> If WHTSVD == 1 :: CGESVD :: +!> LZWORK_SVD = MAX(1,2*MIN(M,N)+MAX(M,N)) +!> If WHTSVD == 2 :: CGESDD :: +!> LZWORK_SVD = 2*MIN(M,N)*MIN(M,N)+2*MIN(M,N)+MAX(M,N) +!> If WHTSVD == 3 :: CGESVDQ :: +!> LZWORK_SVD = obtainable by a query +!> If WHTSVD == 4 :: CGEJSV :: +!> LZWORK_SVD = obtainable by a query +!> If on entry LZWORK = -1, then a workspace query is +!> assumed and the procedure only computes the minimal +!> and the optimal workspace lengths and returns them in +!> LZWORK(1) and LZWORK(2), respectively. +!> \endverbatim +!..... +!> \param[out] RWORK +!> \verbatim +!> RWORK (workspace/output) REAL(KIND=WP) LRWORK-by-1 array +!> On exit, RWORK(1:N) contains the singular values of +!> X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). +!> If WHTSVD==4, then RWORK(N+1) and RWORK(N+2) contain +!> scaling factor RWORK(N+2)/RWORK(N+1) used to scale X +!> and Y to avoid overflow in the SVD of X. +!> This may be of interest if the scaling option is off +!> and as many as possible smallest eigenvalues are +!> desired to the highest feasible accuracy. +!> If the call to CGEDMD is only workspace query, then +!> RWORK(1) contains the minimal workspace length. +!> See the description of LRWORK. +!> \endverbatim +!..... +!> \param[in] LRWORK +!> \verbatim +!> LRWORK (input) INTEGER +!> The minimal length of the workspace vector RWORK. +!> LRWORK is calculated as follows: +!> LRWORK = MAX(1, N+LRWORK_SVD,N+LRWORK_CGEEV), where +!> LRWORK_CGEEV = MAX(1,2*N) and RWORK_SVD is the real workspace +!> for the SVD subroutine determined by the input parameter +!> WHTSVD. +!> If WHTSVD == 1 :: CGESVD :: +!> LRWORK_SVD = 5*MIN(M,N) +!> If WHTSVD == 2 :: CGESDD :: +!> LRWORK_SVD = MAX(5*MIN(M,N)*MIN(M,N)+7*MIN(M,N), +!> 2*MAX(M,N)*MIN(M,N)+2*MIN(M,N)*MIN(M,N)+MIN(M,N) ) ) +!> If WHTSVD == 3 :: CGESVDQ :: +!> LRWORK_SVD = obtainable by a query +!> If WHTSVD == 4 :: CGEJSV :: +!> LRWORK_SVD = obtainable by a query +!> If on entry LRWORK = -1, then a workspace query is +!> assumed and the procedure only computes the minimal +!> real workspace length and returns it in RWORK(1). +!> \endverbatim +!..... +!> \param[out] IWORK +!> \verbatim +!> IWORK (workspace/output) INTEGER LIWORK-by-1 array +!> Workspace that is required only if WHTSVD equals +!> 2 , 3 or 4. (See the description of WHTSVD). +!> If on entry LWORK =-1 or LIWORK=-1, then the +!> minimal length of IWORK is computed and returned in +!> IWORK(1). See the description of LIWORK. +!> \endverbatim +!..... +!> \param[in] LIWORK +!> \verbatim +!> LIWORK (input) INTEGER +!> The minimal length of the workspace vector IWORK. +!> If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 +!> If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) +!> If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) +!> If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) +!> If on entry LIWORK = -1, then a workspace query is +!> assumed and the procedure only computes the minimal +!> and the optimal workspace lengths for ZWORK, RWORK and +!> IWORK. See the descriptions of ZWORK, RWORK and IWORK. +!> \endverbatim +!..... +!> \param[out] INFO +!> \verbatim +!> INFO (output) INTEGER +!> -i < 0 :: On entry, the i-th argument had an +!> illegal value +!> = 0 :: Successful return. +!> = 1 :: Void input. Quick exit (M=0 or N=0). +!> = 2 :: The SVD computation of X did not converge. 
+!> Suggestion: Check the input data and/or +!> repeat with different WHTSVD. +!> = 3 :: The computation of the eigenvalues did not +!> converge. +!> = 4 :: If data scaling was requested on input and +!> the procedure found inconsistency in the data +!> such that for some column index i, +!> X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set +!> to zero if JOBS=='C'. The computation proceeds +!> with original or modified data and warning +!> flag is set with INFO=4. +!> \endverbatim +! +! Authors: +! ======== +! +!> \author Zlatko Drmac +! +!> \ingroup gedmd +! +!............................................................. +!............................................................. SUBROUTINE CGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & M, N, X, LDX, Y, LDY, NRNK, TOL, & K, EIGS, Z, LDZ, RES, B, LDB, & W, LDW, S, LDS, ZWORK, LZWORK, & RWORK, LRWORK, IWORK, LIWORK, INFO ) -! March 2023 +! +! -- LAPACK driver routine -- +! +! -- LAPACK is a software package provided by University of -- +! -- Tennessee, University of California Berkeley, University of -- +! -- Colorado Denver and NAG Ltd.. -- +! !..... USE iso_fortran_env IMPLICIT NONE INTEGER, PARAMETER :: WP = real32 -!..... +! ! Scalar arguments +! ~~~~~~~~~~~~~~~~ CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & NRNK, LDZ, LDB, LDW, LDS, & LIWORK, LRWORK, LZWORK INTEGER, INTENT(OUT) :: K, INFO REAL(KIND=WP), INTENT(IN) :: TOL +! ! Array arguments +! ~~~~~~~~~~~~~~~ COMPLEX(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) COMPLEX(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & W(LDW,*), S(LDS,*) @@ -25,364 +529,14 @@ REAL(KIND=WP), INTENT(OUT) :: RES(*) REAL(KIND=WP), INTENT(OUT) :: RWORK(*) INTEGER, INTENT(OUT) :: IWORK(*) -!............................................................ -! Purpose -! ======= -! CGEDMD computes the Dynamic Mode Decomposition (DMD) for -! a pair of data snapshot matrices. For the input matrices -! X and Y such that Y = A*X with an unaccessible matrix -! A, CGEDMD computes a certain number of Ritz pairs of A using -! the standard Rayleigh-Ritz extraction from a subspace of -! range(X) that is determined using the leading left singular -! vectors of X. Optionally, CGEDMD returns the residuals -! of the computed Ritz pairs, the information needed for -! a refinement of the Ritz vectors, or the eigenvectors of -! the Exact DMD. -! For further details see the references listed -! below. For more details of the implementation see [3]. -! -! References -! ========== -! [1] P. Schmid: Dynamic mode decomposition of numerical -! and experimental data, -! Journal of Fluid Mechanics 656, 5-28, 2010. -! [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal -! decompositions: analysis and enhancements, -! SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. -! [3] Z. Drmac: A LAPACK implementation of the Dynamic -! Mode Decomposition I. Technical report. AIMDyn Inc. -! and LAPACK Working Note 298. -! [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. -! Brunton, N. Kutz: On Dynamic Mode Decomposition: -! Theory and Applications, Journal of Computational -! Dynamics 1(2), 391 -421, 2014. ! -!...................................................................... -! Developed and supported by: -! =========================== -! Developed and coded by Zlatko Drmac, Faculty of Science, -! University of Zagreb; drmac@math.hr -! In cooperation with -! AIMdyn Inc., Santa Barbara, CA. -! and supported by -! - DARPA SBIR project "Koopman Operator-Based Forecasting -! 
for Nonstationary Processes from Near-Term, Limited -! Observational Data" Contract No: W31P4Q-21-C-0007 -! - DARPA PAI project "Physics-Informed Machine Learning -! Methodologies" Contract No: HR0011-18-9-0033 -! - DARPA MoDyL project "A Data-Driven, Operator-Theoretic -! Framework for Space-Time Analysis of Process Dynamics" -! Contract No: HR0011-16-C-0116 -! Any opinions, findings and conclusions or recommendations -! expressed in this material are those of the author and -! do not necessarily reflect the views of the DARPA SBIR -! Program Office -!============================================================ -! Distribution Statement A: -! Approved for Public Release, Distribution Unlimited. -! Cleared by DARPA on September 29, 2022 -!============================================================ -!...................................................................... -! Arguments -! ========= -! JOBS (input) CHARACTER*1 -! Determines whether the initial data snapshots are scaled -! by a diagonal matrix. -! 'S' :: The data snapshots matrices X and Y are multiplied -! with a diagonal matrix D so that X*D has unit -! nonzero columns (in the Euclidean 2-norm) -! 'C' :: The snapshots are scaled as with the 'S' option. -! If it is found that an i-th column of X is zero -! vector and the corresponding i-th column of Y is -! non-zero, then the i-th column of Y is set to -! zero and a warning flag is raised. -! 'Y' :: The data snapshots matrices X and Y are multiplied -! by a diagonal matrix D so that Y*D has unit -! nonzero columns (in the Euclidean 2-norm) -! 'N' :: No data scaling. -!..... -! JOBZ (input) CHARACTER*1 -! Determines whether the eigenvectors (Koopman modes) will -! be computed. -! 'V' :: The eigenvectors (Koopman modes) will be computed -! and returned in the matrix Z. -! See the description of Z. -! 'F' :: The eigenvectors (Koopman modes) will be returned -! in factored form as the product X(:,1:K)*W, where X -! contains a POD basis (leading left singular vectors -! of the data matrix X) and W contains the eigenvectors -! of the corresponding Rayleigh quotient. -! See the descriptions of K, X, W, Z. -! 'N' :: The eigenvectors are not computed. -!..... -! JOBR (input) CHARACTER*1 -! Determines whether to compute the residuals. -! 'R' :: The residuals for the computed eigenpairs will be -! computed and stored in the array RES. -! See the description of RES. -! For this option to be legal, JOBZ must be 'V'. -! 'N' :: The residuals are not computed. -!..... -! JOBF (input) CHARACTER*1 -! Specifies whether to store information needed for post- -! processing (e.g. computing refined Ritz vectors) -! 'R' :: The matrix needed for the refinement of the Ritz -! vectors is computed and stored in the array B. -! See the description of B. -! 'E' :: The unscaled eigenvectors of the Exact DMD are -! computed and returned in the array B. See the -! description of B. -! 'N' :: No eigenvector refinement data is computed. -!..... -! WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } -! Allows for a selection of the SVD algorithm from the -! LAPACK library. -! 1 :: CGESVD (the QR SVD algorithm) -! 2 :: CGESDD (the Divide and Conquer algorithm; if enough -! workspace available, this is the fastest option) -! 3 :: CGESVDQ (the preconditioned QR SVD ; this and 4 -! are the most accurate options) -! 4 :: CGEJSV (the preconditioned Jacobi SVD; this and 3 -! are the most accurate options) -! For the four methods above, a significant difference in -! the accuracy of small singular values is possible if -! 
the snapshots vary in norm so that X is severely -! ill-conditioned. If small (smaller than EPS*||X||) -! singular values are of interest and JOBS=='N', then -! the options (3, 4) give the most accurate results, where -! the option 4 is slightly better and with stronger -! theoretical background. -! If JOBS=='S', i.e. the columns of X will be normalized, -! then all methods give nearly equally accurate results. -!..... -! M (input) INTEGER, M>= 0 -! The state space dimension (the row dimension of X, Y). -!..... -! N (input) INTEGER, 0 <= N <= M -! The number of data snapshot pairs -! (the number of columns of X and Y). -!..... -! X (input/output) COMPLEX(KIND=WP) M-by-N array -! > On entry, X contains the data snapshot matrix X. It is -! assumed that the column norms of X are in the range of -! the normalized floating point numbers. -! < On exit, the leading K columns of X contain a POD basis, -! i.e. the leading K left singular vectors of the input -! data matrix X, U(:,1:K). All N columns of X contain all -! left singular vectors of the input matrix X. -! See the descriptions of K, Z and W. -!..... -! LDX (input) INTEGER, LDX >= M -! The leading dimension of the array X. -!..... -! Y (input/workspace/output) COMPLEX(KIND=WP) M-by-N array -! > On entry, Y contains the data snapshot matrix Y -! < On exit, -! If JOBR == 'R', the leading K columns of Y contain -! the residual vectors for the computed Ritz pairs. -! See the description of RES. -! If JOBR == 'N', Y contains the original input data, -! scaled according to the value of JOBS. -!..... -! LDY (input) INTEGER , LDY >= M -! The leading dimension of the array Y. -!..... -! NRNK (input) INTEGER -! Determines the mode how to compute the numerical rank, -! i.e. how to truncate small singular values of the input -! matrix X. On input, if -! NRNK = -1 :: i-th singular value sigma(i) is truncated -! if sigma(i) <= TOL*sigma(1) -! This option is recommended. -! NRNK = -2 :: i-th singular value sigma(i) is truncated -! if sigma(i) <= TOL*sigma(i-1) -! This option is included for R&D purposes. -! It requires highly accurate SVD, which -! may not be feasible. -! The numerical rank can be enforced by using positive -! value of NRNK as follows: -! 0 < NRNK <= N :: at most NRNK largest singular values -! will be used. If the number of the computed nonzero -! singular values is less than NRNK, then only those -! nonzero values will be used and the actually used -! dimension is less than NRNK. The actual number of -! the nonzero singular values is returned in the variable -! K. See the descriptions of TOL and K. -!..... -! TOL (input) REAL(KIND=WP), 0 <= TOL < 1 -! The tolerance for truncating small singular values. -! See the description of NRNK. -!..... -! K (output) INTEGER, 0 <= K <= N -! The dimension of the POD basis for the data snapshot -! matrix X and the number of the computed Ritz pairs. -! The value of K is determined according to the rule set -! by the parameters NRNK and TOL. -! See the descriptions of NRNK and TOL. -!..... -! EIGS (output) COMPLEX(KIND=WP) N-by-1 array -! The leading K (K<=N) entries of EIGS contain -! the computed eigenvalues (Ritz values). -! See the descriptions of K, and Z. -!..... -! Z (workspace/output) COMPLEX(KIND=WP) M-by-N array -! If JOBZ =='V' then Z contains the Ritz vectors. Z(:,i) -! is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1. -! If JOBZ == 'F', then the Z(:,i)'s are given implicitly as -! the columns of X(:,1:K)*W(1:K,1:K), i.e. X(:,1:K)*W(:,i) -! 
is an eigenvector corresponding to EIGS(i). The columns -! of W(1:k,1:K) are the computed eigenvectors of the -! K-by-K Rayleigh quotient. -! See the descriptions of EIGS, X and W. -!..... -! LDZ (input) INTEGER , LDZ >= M -! The leading dimension of the array Z. -!..... -! RES (output) REAL(KIND=WP) N-by-1 array -! RES(1:K) contains the residuals for the K computed -! Ritz pairs, -! RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i))||_2. -! See the description of EIGS and Z. -!..... -! B (output) COMPLEX(KIND=WP) M-by-N array. -! IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can -! be used for computing the refined vectors; see further -! details in the provided references. -! If JOBF == 'E', B(1:M,1:K) contains -! A*U(:,1:K)*W(1:K,1:K), which are the vectors from the -! Exact DMD, up to scaling by the inverse eigenvalues. -! If JOBF =='N', then B is not referenced. -! See the descriptions of X, W, K. -!..... -! LDB (input) INTEGER, LDB >= M -! The leading dimension of the array B. -!..... -! W (workspace/output) COMPLEX(KIND=WP) N-by-N array -! On exit, W(1:K,1:K) contains the K computed -! eigenvectors of the matrix Rayleigh quotient. -! The Ritz vectors (returned in Z) are the -! product of X (containing a POD basis for the input -! matrix X) and W. See the descriptions of K, S, X and Z. -! W is also used as a workspace to temporarily store the -! right singular vectors of X. -!..... -! LDW (input) INTEGER, LDW >= N -! The leading dimension of the array W. -!..... -! S (workspace/output) COMPLEX(KIND=WP) N-by-N array -! The array S(1:K,1:K) is used for the matrix Rayleigh -! quotient. This content is overwritten during -! the eigenvalue decomposition by CGEEV. -! See the description of K. -!..... -! LDS (input) INTEGER, LDS >= N -! The leading dimension of the array S. -!..... -! ZWORK (workspace/output) COMPLEX(KIND=WP) LZWORK-by-1 array -! ZWORK is used as complex workspace in the complex SVD, as -! specified by WHTSVD (1,2, 3 or 4) and for CGEEV for computing -! the eigenvalues of a Rayleigh quotient. -! If the call to CGEDMD is only workspace query, then -! ZWORK(1) contains the minimal complex workspace length and -! ZWORK(2) is the optimal complex workspace length. -! Hence, the length of work is at least 2. -! See the description of LZWORK. -!..... -! LZWORK (input) INTEGER -! The minimal length of the workspace vector ZWORK. -! LZWORK is calculated as MAX(LZWORK_SVD, LZWORK_CGEEV), -! where LZWORK_CGEEV = MAX( 1, 2*N ) and the minimal -! LZWORK_SVD is calculated as follows -! If WHTSVD == 1 :: CGESVD :: -! LZWORK_SVD = MAX(1,2*MIN(M,N)+MAX(M,N)) -! If WHTSVD == 2 :: CGESDD :: -! LZWORK_SVD = 2*MIN(M,N)*MIN(M,N)+2*MIN(M,N)+MAX(M,N) -! If WHTSVD == 3 :: CGESVDQ :: -! LZWORK_SVD = obtainable by a query -! If WHTSVD == 4 :: CGEJSV :: -! LZWORK_SVD = obtainable by a query -! If on entry LZWORK = -1, then a workspace query is -! assumed and the procedure only computes the minimal -! and the optimal workspace lengths and returns them in -! LZWORK(1) and LZWORK(2), respectively. -!..... -! RWORK (workspace/output) REAL(KIND=WP) LRWORK-by-1 array -! On exit, RWORK(1:N) contains the singular values of -! X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). -! If WHTSVD==4, then RWORK(N+1) and RWORK(N+2) contain -! scaling factor RWORK(N+2)/RWORK(N+1) used to scale X -! and Y to avoid overflow in the SVD of X. -! This may be of interest if the scaling option is off -! and as many as possible smallest eigenvalues are -! desired to the highest feasible accuracy. -! 
If the call to CGEDMD is only workspace query, then -! RWORK(1) contains the minimal workspace length. -! See the description of LRWORK. -!..... -! LRWORK (input) INTEGER -! The minimal length of the workspace vector RWORK. -! LRWORK is calculated as follows: -! LRWORK = MAX(1, N+LRWORK_SVD,N+LRWORK_CGEEV), where -! LRWORK_CGEEV = MAX(1,2*N) and RWORK_SVD is the real workspace -! for the SVD subroutine determined by the input parameter -! WHTSVD. -! If WHTSVD == 1 :: CGESVD :: -! LRWORK_SVD = 5*MIN(M,N) -! If WHTSVD == 2 :: CGESDD :: -! LRWORK_SVD = MAX(5*MIN(M,N)*MIN(M,N)+7*MIN(M,N), -! 2*MAX(M,N)*MIN(M,N)+2*MIN(M,N)*MIN(M,N)+MIN(M,N) ) ) -! If WHTSVD == 3 :: CGESVDQ :: -! LRWORK_SVD = obtainable by a query -! If WHTSVD == 4 :: CGEJSV :: -! LRWORK_SVD = obtainable by a query -! If on entry LRWORK = -1, then a workspace query is -! assumed and the procedure only computes the minimal -! real workspace length and returns it in RWORK(1). -!..... -! IWORK (workspace/output) INTEGER LIWORK-by-1 array -! Workspace that is required only if WHTSVD equals -! 2 , 3 or 4. (See the description of WHTSVD). -! If on entry LWORK =-1 or LIWORK=-1, then the -! minimal length of IWORK is computed and returned in -! IWORK(1). See the description of LIWORK. -!..... -! LIWORK (input) INTEGER -! The minimal length of the workspace vector IWORK. -! If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 -! If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) -! If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) -! If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) -! If on entry LIWORK = -1, then a workspace query is -! assumed and the procedure only computes the minimal -! and the optimal workspace lengths for ZWORK, RWORK and -! IWORK. See the descriptions of ZWORK, RWORK and IWORK. -!..... -! INFO (output) INTEGER -! -i < 0 :: On entry, the i-th argument had an -! illegal value -! = 0 :: Successful return. -! = 1 :: Void input. Quick exit (M=0 or N=0). -! = 2 :: The SVD computation of X did not converge. -! Suggestion: Check the input data and/or -! repeat with different WHTSVD. -! = 3 :: The computation of the eigenvalues did not -! converge. -! = 4 :: If data scaling was requested on input and -! the procedure found inconsistency in the data -! such that for some column index i, -! X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set -! to zero if JOBS=='C'. The computation proceeds -! with original or modified data and warning -! flag is set with INFO=4. -!............................................................. -!............................................................. ! Parameters ! ~~~~~~~~~~ REAL(KIND=WP), PARAMETER :: ONE = 1.0_WP REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP COMPLEX(KIND=WP), PARAMETER :: ZONE = ( 1.0_WP, 0.0_WP ) COMPLEX(KIND=WP), PARAMETER :: ZZERO = ( 0.0_WP, 0.0_WP ) - +! ! Local scalars ! ~~~~~~~~~~~~~ REAL(KIND=WP) :: OFL, ROOTSC, SCALE, SMALL, & @@ -400,7 +554,7 @@ ! Local arrays ! ~~~~~~~~~~~~ REAL(KIND=WP) :: RDUMMY(2) - +! ! External functions (BLAS and LAPACK) ! ~~~~~~~~~~~~~~~~~ REAL(KIND=WP) CLANGE, SLAMCH, SCNRM2 @@ -408,13 +562,13 @@ INTEGER ICAMAX LOGICAL SISNAN, LSAME EXTERNAL SISNAN, LSAME - +! ! External subroutines (BLAS and LAPACK) ! ~~~~~~~~~~~~~~~~~~~~ EXTERNAL CAXPY, CGEMM, CSSCAL EXTERNAL CGEEV, CGEJSV, CGESDD, CGESVD, CGESVDQ, & CLACPY, CLASCL, CLASSQ, XERBLA - +! ! Intrinsic functions ! 
~~~~~~~~~~~~~~~~~~~ INTRINSIC FLOAT, INT, MAX, SQRT @@ -607,7 +761,8 @@ K = 0 DO i = 1, N !WORK(i) = SCNRM2( M, X(1,i), 1 ) - SCALE = ZERO + SSUM = ONE + SCALE = ZERO CALL CLASSQ( M, X(1,i), 1, SCALE, SSUM ) IF ( SISNAN(SCALE) .OR. SISNAN(SSUM) ) THEN K = 0 @@ -680,7 +835,8 @@ ! carefully computed using CLASSQ. DO i = 1, N !RWORK(i) = SCNRM2( M, Y(1,i), 1 ) - SCALE = ZERO + SSUM = ONE + SCALE = ZERO CALL CLASSQ( M, Y(1,i), 1, SCALE, SSUM ) IF ( SISNAN(SCALE) .OR. SISNAN(SSUM) ) THEN K = 0 diff --git a/lapack-netlib/SRC/cgehrd.f b/lapack-netlib/SRC/cgehrd.f index f407f931a..7ba87cc01 100644 --- a/lapack-netlib/SRC/cgehrd.f +++ b/lapack-netlib/SRC/cgehrd.f @@ -89,7 +89,7 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX array, dimension (LWORK) +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> @@ -222,13 +222,19 @@ INFO = -8 END IF * + NH = IHI - ILO + 1 IF( INFO.EQ.0 ) THEN * * Compute the workspace requirements * - NB = MIN( NBMAX, ILAENV( 1, 'CGEHRD', ' ', N, ILO, IHI, -1 ) ) - LWKOPT = N*NB + TSIZE - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + IF( NH.LE.1 ) THEN + LWKOPT = 1 + ELSE + NB = MIN( NBMAX, ILAENV( 1, 'DGEHRD', ' ', N, ILO, IHI, + $ -1 ) ) + LWKOPT = N*NB + TSIZE + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -249,7 +255,6 @@ * * Quick return if possible * - NH = IHI - ILO + 1 IF( NH.LE.1 ) THEN WORK( 1 ) = 1 RETURN @@ -269,7 +274,7 @@ * * Determine if workspace is large enough for blocked code * - IF( LWORK.LT.N*NB+TSIZE ) THEN + IF( LWORK.LT.LWKOPT ) THEN * * Not enough workspace to use optimal NB: determine the * minimum value of NB, and reduce NB or force use of @@ -345,7 +350,8 @@ * Use unblocked code to reduce the rest of the matrix * CALL CGEHD2( N, I, IHI, A, LDA, TAU, WORK, IINFO ) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) +* + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/cgelq.f b/lapack-netlib/SRC/cgelq.f index ff482bc42..24aaa982e 100644 --- a/lapack-netlib/SRC/cgelq.f +++ b/lapack-netlib/SRC/cgelq.f @@ -98,7 +98,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. The routine *> only calculates the sizes of the T and WORK arrays, returns these *> values as the first entries of the T and WORK arrays, and no error @@ -295,9 +295,9 @@ T( 2 ) = MB T( 3 ) = NB IF( MINW ) THEN - WORK( 1 ) = SROUNDUP_LWORK(LWMIN) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) ELSE - WORK( 1 ) = SROUNDUP_LWORK(LWREQ) + WORK( 1 ) = SROUNDUP_LWORK( LWREQ ) END IF END IF IF( INFO.NE.0 ) THEN @@ -322,7 +322,7 @@ $ LWORK, INFO ) END IF * - WORK( 1 ) = SROUNDUP_LWORK(LWREQ) + WORK( 1 ) = SROUNDUP_LWORK( LWREQ ) * RETURN * diff --git a/lapack-netlib/SRC/cgelqf.f b/lapack-netlib/SRC/cgelqf.f index 75f5bc960..3847a958a 100644 --- a/lapack-netlib/SRC/cgelqf.f +++ b/lapack-netlib/SRC/cgelqf.f @@ -93,7 +93,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,M). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= M, otherwise. *> For optimum performance LWORK >= M*NB, where NB is the *> optimal blocksize. 
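The CGELQF bound spelled out above (LWORK >= 1 when MIN(M,N) = 0, LWORK >= M otherwise, M*NB for best performance) is easiest to satisfy through the standard two-call workspace query that these hunks tighten. A minimal free-form sketch, separate from the patch itself, assuming only a linked LAPACK library; the program name, dimensions and matrix contents are arbitrary:

program cgelqf_query_demo
   implicit none
   integer, parameter :: m = 4, n = 6
   complex :: a(m,n), tau(min(m,n)), wquery(1)
   complex, allocatable :: work(:)
   integer :: lwork, info
   external cgelqf

   a = (1.0, 0.0)                                   ! any data; only the calling pattern matters

   call cgelqf(m, n, a, m, tau, wquery, -1, info)   ! workspace query, no factorization done
   lwork = max(1, int(wquery(1)))                   ! optimal size; at least 1 even when MIN(M,N) = 0
   allocate(work(lwork))
   call cgelqf(m, n, a, m, tau, work, lwork, info)  ! the actual LQ factorization
   if (info /= 0) print *, 'CGELQF returned INFO =', info
end program cgelqf_query_demo

With the query path now reporting a size of at least 1, the MAX(1, ...) guard on the caller's side is defensive rather than required.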
*> @@ -175,9 +176,8 @@ * Test the input arguments * INFO = 0 + K = MIN( M, N ) NB = ILAENV( 1, 'CGELQF', ' ', M, N, -1, -1 ) - LWKOPT = M*NB - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -185,19 +185,25 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, M ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + ELSE IF( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( N.GT.0 .AND. LWORK.LT.MAX( 1, M ) ) ) + $ INFO = -7 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'CGELQF', -INFO ) RETURN ELSE IF( LQUERY ) THEN + IF( K.EQ.0 ) THEN + LWKOPT = 1 + ELSE + LWKOPT = M*NB + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN END IF * * Quick return if possible * - K = MIN( M, N ) IF( K.EQ.0 ) THEN WORK( 1 ) = 1 RETURN @@ -267,7 +273,7 @@ $ CALL CGELQ2( M-I+1, N-I+1, A( I, I ), LDA, TAU( I ), WORK, $ IINFO ) * - WORK( 1 ) = SROUNDUP_LWORK(IWS) + WORK( 1 ) = SROUNDUP_LWORK( IWS ) RETURN * * End of CGELQF diff --git a/lapack-netlib/SRC/cgemlq.f b/lapack-netlib/SRC/cgemlq.f index e0cf78bc0..e5b02b669 100644 --- a/lapack-netlib/SRC/cgemlq.f +++ b/lapack-netlib/SRC/cgemlq.f @@ -110,16 +110,17 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1, then a workspace query is assumed. The routine *> only calculates the size of the WORK array, returns this -*> value as WORK(1), and no error message related to WORK +*> value as WORK(1), and no error message related to WORK *> is issued by XERBLA. *> \endverbatim *> @@ -143,7 +144,7 @@ *> *> \verbatim *> -*> These details are particular for this LAPACK implementation. Users should not +*> These details are particular for this LAPACK implementation. Users should not *> take them for granted. These details may change in the future, and are not likely *> true for another LAPACK implementation. These details are relevant if one wants *> to try to understand the code. They are not part of the interface. @@ -159,11 +160,13 @@ *> block sizes MB and NB returned by ILAENV, CGELQ will use either *> CLASWLQ (if the matrix is wide-and-short) or CGELQT to compute *> the LQ factorization. -*> This version of CGEMLQ will use either CLAMSWLQ or CGEMLQT to +*> This version of CGEMLQ will use either CLAMSWLQ or CGEMLQT to *> multiply matrix Q by another matrix. *> Further Details in CLAMSWLQ or CGEMLQT. *> \endverbatim *> +*> \ingroup gemlq +*> * ===================================================================== SUBROUTINE CGEMLQ( SIDE, TRANS, M, N, K, A, LDA, T, TSIZE, $ C, LDC, WORK, LWORK, INFO ) @@ -185,11 +188,12 @@ * .. * .. Local Scalars .. LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER MB, NB, LW, NBLCKS, MN + INTEGER MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. 
EXTERNAL CLAMSWLQ, CGEMLQT, XERBLA @@ -201,7 +205,7 @@ * * Test the input arguments * - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'C' ) LEFT = LSAME( SIDE, 'L' ) @@ -216,6 +220,13 @@ LW = M * MB MN = N END IF +* + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF * IF( ( NB.GT.K ) .AND. ( MN.GT.K ) ) THEN IF( MOD( MN - K, NB - K ) .EQ. 0 ) THEN @@ -244,12 +255,12 @@ INFO = -9 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN INFO = -11 - ELSE IF( ( LWORK.LT.MAX( 1, LW ) ) .AND. ( .NOT.LQUERY ) ) THEN + ELSE IF( ( LWORK.LT.LWMIN ) .AND. ( .NOT.LQUERY ) ) THEN INFO = -13 END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = REAL( LW ) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN @@ -261,7 +272,7 @@ * * Quick return if possible * - IF( MIN( M, N, K ).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * @@ -274,7 +285,7 @@ $ MB, C, LDC, WORK, LWORK, INFO ) END IF * - WORK( 1 ) = REAL( LW ) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) * RETURN * diff --git a/lapack-netlib/SRC/cgemqr.f b/lapack-netlib/SRC/cgemqr.f index ea9de146e..0b7dd9dd7 100644 --- a/lapack-netlib/SRC/cgemqr.f +++ b/lapack-netlib/SRC/cgemqr.f @@ -111,16 +111,17 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1, then a workspace query is assumed. The routine *> only calculates the size of the WORK array, returns this -*> value as WORK(1), and no error message related to WORK +*> value as WORK(1), and no error message related to WORK *> is issued by XERBLA. *> \endverbatim *> @@ -144,7 +145,7 @@ *> *> \verbatim *> -*> These details are particular for this LAPACK implementation. Users should not +*> These details are particular for this LAPACK implementation. Users should not *> take them for granted. These details may change in the future, and are not likely *> true for another LAPACK implementation. These details are relevant if one wants *> to try to understand the code. They are not part of the interface. @@ -166,6 +167,8 @@ *> *> \endverbatim *> +*> \ingroup gemqr +*> * ===================================================================== SUBROUTINE CGEMQR( SIDE, TRANS, M, N, K, A, LDA, T, TSIZE, $ C, LDC, WORK, LWORK, INFO ) @@ -187,11 +190,12 @@ * .. * .. Local Scalars .. LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER MB, NB, LW, NBLCKS, MN + INTEGER MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CGEMQRT, CLAMTSQR, XERBLA @@ -203,7 +207,7 @@ * * Test the input arguments * - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'C' ) LEFT = LSAME( SIDE, 'L' ) @@ -218,6 +222,13 @@ LW = MB * NB MN = N END IF +* + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF * IF( ( MB.GT.K ) .AND. 
( MN.GT.K ) ) THEN IF( MOD( MN - K, MB - K ).EQ.0 ) THEN @@ -251,7 +262,7 @@ END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = LW + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN @@ -263,7 +274,7 @@ * * Quick return if possible * - IF( MIN( M, N, K ).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * @@ -276,7 +287,7 @@ $ NB, C, LDC, WORK, LWORK, INFO ) END IF * - WORK( 1 ) = LW + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) * RETURN * diff --git a/lapack-netlib/SRC/cgeqlf.f b/lapack-netlib/SRC/cgeqlf.f index 918bbddad..6c67344c5 100644 --- a/lapack-netlib/SRC/cgeqlf.f +++ b/lapack-netlib/SRC/cgeqlf.f @@ -88,7 +88,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,N). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise. *> For optimum performance LWORK >= N*NB, where NB is *> the optimal blocksize. *> @@ -187,10 +188,11 @@ NB = ILAENV( 1, 'CGEQLF', ' ', M, N, -1, -1 ) LWKOPT = N*NB END IF - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * - IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + IF( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( M.GT.0 .AND. LWORK.LT.MAX( 1, N ) ) ) + $ INFO = -7 END IF END IF * @@ -277,7 +279,7 @@ IF( MU.GT.0 .AND. NU.GT.0 ) $ CALL CGEQL2( MU, NU, A, LDA, TAU, WORK, IINFO ) * - WORK( 1 ) = SROUNDUP_LWORK(IWS) + WORK( 1 ) = SROUNDUP_LWORK( IWS ) RETURN * * End of CGEQLF diff --git a/lapack-netlib/SRC/cgeqp3rk.f b/lapack-netlib/SRC/cgeqp3rk.f index 587860684..731c44edb 100644 --- a/lapack-netlib/SRC/cgeqp3rk.f +++ b/lapack-netlib/SRC/cgeqp3rk.f @@ -428,7 +428,8 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*. LWORK >= N+NRHS-1 +*> LWORK >= 1, if MIN(M,N) = 0, and +*> LWORK >= N+NRHS-1, otherwise. *> For optimal performance LWORK >= NB*( N+NRHS+1 ), *> where NB is the optimal block size for CGEQP3RK returned *> by ILAENV. Minimal block size MINNB=2. @@ -627,8 +628,9 @@ * .. External Functions .. LOGICAL SISNAN INTEGER ISAMAX, ILAENV - REAL SLAMCH, SCNRM2 - EXTERNAL SISNAN, SLAMCH, SCNRM2, ISAMAX, ILAENV + REAL SLAMCH, SCNRM2, SROUNDUP_LWORK + EXTERNAL SISNAN, SLAMCH, SCNRM2, ISAMAX, ILAENV, + $ SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, MAX, MIN @@ -703,7 +705,7 @@ * LWKOPT = 2*N + NB*( N+NRHS+1 ) END IF - WORK( 1 ) = CMPLX( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * IF( ( LWORK.LT.IWS ) .AND. .NOT.LQUERY ) THEN INFO = -15 @@ -726,7 +728,7 @@ K = 0 MAXC2NRMK = ZERO RELMAXC2NRMK = ZERO - WORK( 1 ) = CMPLX( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN END IF * @@ -778,7 +780,7 @@ * * Array TAU is not set and contains undefined elements. * - WORK( 1 ) = CMPLX( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN END IF * @@ -797,7 +799,7 @@ TAU( J ) = CZERO END DO * - WORK( 1 ) = CMPLX( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * END IF @@ -828,7 +830,7 @@ DO J = 1, MINMN TAU( J ) = CZERO END DO - WORK( 1 ) = CMPLX( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN END IF * @@ -873,7 +875,7 @@ TAU( J ) = CZERO END DO * - WORK( 1 ) = CMPLX( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN END IF * @@ -991,7 +993,7 @@ * * Return from the routine. 
* - WORK( 1 ) = CMPLX( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * @@ -1082,7 +1084,7 @@ * END IF * - WORK( 1 ) = CMPLX( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/cgeqr.f b/lapack-netlib/SRC/cgeqr.f index d10e3da65..3617594d0 100644 --- a/lapack-netlib/SRC/cgeqr.f +++ b/lapack-netlib/SRC/cgeqr.f @@ -99,7 +99,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. The routine *> only calculates the sizes of the T and WORK arrays, returns these *> values as the first entries of the T and WORK arrays, and no error @@ -168,6 +168,8 @@ *> *> \endverbatim *> +*> \ingroup geqr +*> * ===================================================================== SUBROUTINE CGEQR( M, N, A, LDA, T, TSIZE, WORK, LWORK, $ INFO ) @@ -188,11 +190,12 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, LMINWS, MINT, MINW - INTEGER MB, NB, MINTSZ, NBLCKS + INTEGER MB, NB, MINTSZ, NBLCKS, LWMIN, LWREQ * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CLATSQR, CGEQRT, XERBLA @@ -244,8 +247,10 @@ * * Determine if the workspace size satisfies minimal size * + LWMIN = MAX( 1, N ) + LWREQ = MAX( 1, N*NB ) LMINWS = .FALSE. - IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.NB*N ) + IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.LWREQ ) $ .AND. ( LWORK.GE.N ) .AND. ( TSIZE.GE.MINTSZ ) $ .AND. ( .NOT.LQUERY ) ) THEN IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) ) THEN @@ -253,7 +258,7 @@ NB = 1 MB = M END IF - IF( LWORK.LT.NB*N ) THEN + IF( LWORK.LT.LWREQ ) THEN LMINWS = .TRUE. NB = 1 END IF @@ -268,7 +273,7 @@ ELSE IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) $ .AND. ( .NOT.LQUERY ) .AND. ( .NOT.LMINWS ) ) THEN INFO = -6 - ELSE IF( ( LWORK.LT.MAX( 1, N*NB ) ) .AND. ( .NOT.LQUERY ) + ELSE IF( ( LWORK.LT.LWREQ ) .AND. ( .NOT.LQUERY ) $ .AND. ( .NOT.LMINWS ) ) THEN INFO = -8 END IF @@ -282,9 +287,9 @@ T( 2 ) = MB T( 3 ) = NB IF( MINW ) THEN - WORK( 1 ) = MAX( 1, N ) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) ELSE - WORK( 1 ) = MAX( 1, NB*N ) + WORK( 1 ) = SROUNDUP_LWORK( LWREQ ) END IF END IF IF( INFO.NE.0 ) THEN @@ -309,7 +314,7 @@ $ LWORK, INFO ) END IF * - WORK( 1 ) = MAX( 1, NB*N ) + WORK( 1 ) = SROUNDUP_LWORK( LWREQ ) * RETURN * diff --git a/lapack-netlib/SRC/cgeqrfp.f b/lapack-netlib/SRC/cgeqrfp.f index eaf98ddf3..5b6226c67 100644 --- a/lapack-netlib/SRC/cgeqrfp.f +++ b/lapack-netlib/SRC/cgeqrfp.f @@ -97,7 +97,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,N). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise. *> For optimum performance LWORK >= N*NB, where NB is *> the optimal blocksize. *> @@ -162,8 +163,8 @@ * * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, IINFO, IWS, K, LDWORK, LWKOPT, NB, - $ NBMIN, NX + INTEGER I, IB, IINFO, IWS, K, LDWORK, LWKMIN, LWKOPT, + $ NB, NBMIN, NX * .. * .. External Subroutines .. 
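The repeated WORK( 1 ) = CMPLX( LWKOPT ) to SROUNDUP_LWORK( LWKOPT ) substitutions in this file guard against single-precision rounding: a REAL (or the real part of a COMPLEX) cannot represent every integer above 2**24, so the plain conversion may round down and a caller that allocates INT(WORK(1)) elements would come up short. A small stand-alone sketch of the hazard, separate from the patch and not the library's actual SROUNDUP_LWORK code; the final ulp-bump only illustrates the idea:

program lwork_rounding_demo
   implicit none
   integer, parameter :: lwkopt = 2**24 + 1    ! 16777217 has no exact default-REAL representation
   real :: w1

   w1 = real(lwkopt)                           ! what a plain REAL/CMPLX conversion would store
   print *, 'LWKOPT              =', lwkopt
   print *, 'INT(REAL(LWKOPT))   =', int(w1)                       ! rounds down: one element short
   print *, 'rounded up by 1 ulp =', int(w1*(1.0 + epsilon(1.0)))  ! back to >= LWKOPT
end program lwork_rounding_demo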
EXTERNAL CGEQR2P, CLARFB, CLARFT, XERBLA @@ -182,8 +183,16 @@ * INFO = 0 NB = ILAENV( 1, 'CGEQRF', ' ', M, N, -1, -1 ) - LWKOPT = N*NB - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + K = MIN( M, N ) + IF( K.EQ.0 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = N + LWKOPT = N*NB + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) +* LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -191,7 +200,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -7 END IF IF( INFO.NE.0 ) THEN @@ -203,7 +212,6 @@ * * Quick return if possible * - K = MIN( M, N ) IF( K.EQ.0 ) THEN WORK( 1 ) = 1 RETURN @@ -211,7 +219,7 @@ * NBMIN = 2 NX = 0 - IWS = N + IWS = LWKMIN IF( NB.GT.1 .AND. NB.LT.K ) THEN * * Determine when to cross over from blocked to unblocked code. @@ -273,7 +281,7 @@ $ CALL CGEQR2P( M-I+1, N-I+1, A( I, I ), LDA, TAU( I ), WORK, $ IINFO ) * - WORK( 1 ) = SROUNDUP_LWORK(IWS) + WORK( 1 ) = SROUNDUP_LWORK( IWS ) RETURN * * End of CGEQRFP diff --git a/lapack-netlib/SRC/cgesvdx.f b/lapack-netlib/SRC/cgesvdx.f index fbdb121ca..e1856a65f 100644 --- a/lapack-netlib/SRC/cgesvdx.f +++ b/lapack-netlib/SRC/cgesvdx.f @@ -208,7 +208,7 @@ *> \param[out] WORK *> \verbatim *> WORK is COMPLEX array, dimension (MAX(1,LWORK)) -*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK; +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK @@ -261,7 +261,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEsing +*> \ingroup gesvdx * * ===================================================================== SUBROUTINE CGESVDX( JOBU, JOBVT, RANGE, M, N, A, LDA, VL, VU, @@ -312,8 +312,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL SLAMCH, CLANGE - EXTERNAL LSAME, ILAENV, SLAMCH, CLANGE + REAL SLAMCH, CLANGE, SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SLAMCH, CLANGE, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, SQRT @@ -448,7 +448,7 @@ END IF END IF MAXWRK = MAX( MAXWRK, MINWRK ) - WORK( 1 ) = CMPLX( REAL( MAXWRK ), ZERO ) + WORK( 1 ) = SROUNDUP_LWORK( MAXWRK ) * IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN INFO = -19 @@ -464,7 +464,7 @@ * * Quick return if possible * - IF( M.EQ.0 .OR. N.EQ.0 ) THEN + IF( MINMN.EQ.0 ) THEN RETURN END IF * @@ -846,7 +846,7 @@ * * Return optimal workspace in WORK(1) * - WORK( 1 ) = CMPLX( REAL( MAXWRK ), ZERO ) + WORK( 1 ) = SROUNDUP_LWORK( MAXWRK ) * RETURN * diff --git a/lapack-netlib/SRC/cgesvj.f b/lapack-netlib/SRC/cgesvj.f index 149cf5e48..b9c8f1709 100644 --- a/lapack-netlib/SRC/cgesvj.f +++ b/lapack-netlib/SRC/cgesvj.f @@ -208,15 +208,17 @@ *> \verbatim *> CWORK is COMPLEX array, dimension (max(1,LWORK)) *> Used as workspace. -*> If on entry LWORK = -1, then a workspace query is assumed and -*> no computation is done; CWORK(1) is set to the minial (and optimal) -*> length of CWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER. -*> Length of CWORK, LWORK >= M+N. +*> Length of CWORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= M+N, otherwise. +*> +*> If on entry LWORK = -1, then a workspace query is assumed and +*> no computation is done; CWORK(1) is set to the minial (and optimal) +*> length of CWORK. *> \endverbatim *> *> \param[in,out] RWORK @@ -247,15 +249,17 @@ *> RWORK(6) = the largest absolute value over all sines of the *> Jacobi rotation angles in the last sweep. It can be *> useful for a post festum analysis. 
-*> If on entry LRWORK = -1, then a workspace query is assumed and -*> no computation is done; RWORK(1) is set to the minial (and optimal) -*> length of RWORK. *> \endverbatim *> *> \param[in] LRWORK *> \verbatim *> LRWORK is INTEGER -*> Length of RWORK, LRWORK >= MAX(6,N). +*> Length of RWORK. +*> LRWORK >= 1, if MIN(M,N) = 0, and LRWORK >= MAX(6,N), otherwise +*> +*> If on entry LRWORK = -1, then a workspace query is assumed and +*> no computation is done; RWORK(1) is set to the minial (and optimal) +*> length of RWORK. *> \endverbatim *> *> \param[out] INFO @@ -276,7 +280,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEcomputational +*> \ingroup gesvj * *> \par Further Details: * ===================== @@ -374,16 +378,17 @@ PARAMETER ( NSWEEP = 30 ) * .. * .. Local Scalars .. - COMPLEX AAPQ, OMPQ - REAL AAPP, AAPP0, AAPQ1, AAQQ, APOAQ, AQOAP, BIG, - $ BIGTHETA, CS, CTOL, EPSLN, MXAAPQ, - $ MXSINJ, ROOTBIG, ROOTEPS, ROOTSFMIN, ROOTTOL, - $ SKL, SFMIN, SMALL, SN, T, TEMP1, THETA, THSIGN, TOL - INTEGER BLSKIP, EMPTSW, i, ibr, IERR, igl, IJBLSK, ir1, - $ ISWROT, jbc, jgl, KBL, LKAHEAD, MVL, N2, N34, - $ N4, NBL, NOTROT, p, PSKIPPED, q, ROWSKIP, SWBAND - LOGICAL APPLV, GOSCALE, LOWER, LQUERY, LSVEC, NOSCALE, ROTOK, - $ RSVEC, UCTOL, UPPER + COMPLEX AAPQ, OMPQ + REAL AAPP, AAPP0, AAPQ1, AAQQ, APOAQ, AQOAP, BIG, + $ BIGTHETA, CS, CTOL, EPSLN, MXAAPQ, + $ MXSINJ, ROOTBIG, ROOTEPS, ROOTSFMIN, ROOTTOL, + $ SKL, SFMIN, SMALL, SN, T, TEMP1, THETA, THSIGN, TOL + INTEGER BLSKIP, EMPTSW, i, ibr, IERR, igl, IJBLSK, ir1, + $ ISWROT, jbc, jgl, KBL, LKAHEAD, MVL, N2, N34, + $ N4, NBL, NOTROT, p, PSKIPPED, q, ROWSKIP, SWBAND, + $ MINMN, LWMIN, LRWMIN + LOGICAL APPLV, GOSCALE, LOWER, LQUERY, LSVEC, NOSCALE, ROTOK, + $ RSVEC, UCTOL, UPPER * .. * .. * .. Intrinsic Functions .. @@ -398,8 +403,8 @@ INTEGER ISAMAX EXTERNAL ISAMAX * from LAPACK - REAL SLAMCH - EXTERNAL SLAMCH + REAL SLAMCH, SROUNDUP_LWORK + EXTERNAL SLAMCH, SROUNDUP_LWORK LOGICAL LSAME EXTERNAL LSAME * .. @@ -422,7 +427,16 @@ UPPER = LSAME( JOBA, 'U' ) LOWER = LSAME( JOBA, 'L' ) * - LQUERY = ( LWORK .EQ. -1 ) .OR. ( LRWORK .EQ. -1 ) + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + LRWMIN = 1 + ELSE + LWMIN = M + N + LRWMIN = MAX( 6, N ) + END IF +* + LQUERY = ( LWORK.EQ.-1 ) .OR. ( LRWORK.EQ.-1 ) IF( .NOT.( UPPER .OR. LOWER .OR. LSAME( JOBA, 'G' ) ) ) THEN INFO = -1 ELSE IF( .NOT.( LSVEC .OR. UCTOL .OR. LSAME( JOBU, 'N' ) ) ) THEN @@ -442,9 +456,9 @@ INFO = -11 ELSE IF( UCTOL .AND. ( RWORK( 1 ).LE.ONE ) ) THEN INFO = -12 - ELSE IF( LWORK.LT.( M+N ) .AND. ( .NOT.LQUERY ) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. ( .NOT.LQUERY ) ) THEN INFO = -13 - ELSE IF( LRWORK.LT.MAX( N, 6 ) .AND. ( .NOT.LQUERY ) ) THEN + ELSE IF( LRWORK.LT.LRWMIN .AND. ( .NOT.LQUERY ) ) THEN INFO = -15 ELSE INFO = 0 @@ -454,15 +468,15 @@ IF( INFO.NE.0 ) THEN CALL XERBLA( 'CGESVJ', -INFO ) RETURN - ELSE IF ( LQUERY ) THEN - CWORK(1) = M + N - RWORK(1) = MAX( N, 6 ) + ELSE IF( LQUERY ) THEN + CWORK( 1 ) = SROUNDUP_LWORK( LWMIN ) + RWORK( 1 ) = SROUNDUP_LWORK( LRWMIN ) RETURN END IF * * #:) Quick return for void matrix * - IF( ( M.EQ.0 ) .OR. 
( N.EQ.0 ) )RETURN + IF( MINMN.EQ.0 ) RETURN * * Set numerical parameters * The stopping criterion for Jacobi rotations is diff --git a/lapack-netlib/SRC/cgetri.f b/lapack-netlib/SRC/cgetri.f index 2060d1444..2eb3da7ab 100644 --- a/lapack-netlib/SRC/cgetri.f +++ b/lapack-netlib/SRC/cgetri.f @@ -153,8 +153,8 @@ * INFO = 0 NB = ILAENV( 1, 'CGETRI', ' ', N, -1, -1, -1 ) - LWKOPT = N*NB - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + LWKOPT = MAX( 1, N*NB ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.0 ) THEN INFO = -1 @@ -252,7 +252,7 @@ $ CALL CSWAP( N, A( 1, J ), 1, A( 1, JP ), 1 ) 60 CONTINUE * - WORK( 1 ) = SROUNDUP_LWORK(IWS) + WORK( 1 ) = SROUNDUP_LWORK( IWS ) RETURN * * End of CGETRI diff --git a/lapack-netlib/SRC/cgetsls.f b/lapack-netlib/SRC/cgetsls.f index b4bb7562f..3f43dc8de 100644 --- a/lapack-netlib/SRC/cgetsls.f +++ b/lapack-netlib/SRC/cgetsls.f @@ -127,7 +127,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. *> If LWORK = -1, the routine calculates optimal size of WORK for the *> optimal performance and returns this value in WORK(1). @@ -229,7 +229,10 @@ * * Determine the optimum and minimum LWORK * - IF( M.GE.N ) THEN + IF( MIN( M, N, NRHS ).EQ.0 ) THEN + WSIZEO = 1 + WSIZEM = 1 + ELSE IF ( M.GE.N ) THEN CALL CGEQR( M, N, A, LDA, TQ, -1, WORKQ, -1, INFO2 ) TSZO = INT( TQ( 1 ) ) LWO = INT( WORKQ( 1 ) ) diff --git a/lapack-netlib/SRC/cgetsqrhrt.f b/lapack-netlib/SRC/cgetsqrhrt.f index 4e4dc1d4a..087e9bc7f 100644 --- a/lapack-netlib/SRC/cgetsqrhrt.f +++ b/lapack-netlib/SRC/cgetsqrhrt.f @@ -131,13 +131,15 @@ *> \param[in] LWORK *> \verbatim *> The dimension of the array WORK. -*> LWORK >= MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ), +*> If MIN(M,N) = 0, LWORK >= 1, else +*> LWORK >= MAX( 1, LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ), *> where *> NUM_ALL_ROW_BLOCKS = CEIL((M-N)/(MB1-N)), *> NB1LOCAL = MIN(NB1,N). *> LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL, *> LW1 = NB1LOCAL * N, -*> LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ), +*> LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ). +*> *> If LWORK = -1, then a workspace query is assumed. *> The routine only calculates the optimal size of the WORK *> array, returns this value as the first entry of the WORK @@ -160,7 +162,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup comlpexOTHERcomputational +*> \ingroup getsqrhrt * *> \par Contributors: * ================== @@ -200,6 +202,10 @@ INTEGER I, IINFO, J, LW1, LW2, LWT, LDWT, LWORKOPT, $ NB1LOCAL, NB2LOCAL, NUM_ALL_ROW_BLOCKS * .. +* .. External Functions .. + REAL SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK +* .. * .. External Subroutines .. EXTERNAL CCOPY, CLATSQR, CUNGTSQR_ROW, CUNHR_COL, $ XERBLA @@ -212,7 +218,7 @@ * Test the input arguments * INFO = 0 - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 .OR. M.LT.N ) THEN @@ -225,7 +231,7 @@ INFO = -5 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -7 - ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN + ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN INFO = -9 ELSE * @@ -263,8 +269,9 @@ LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ) * LWORKOPT = MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ) + LWORKOPT = MAX( 1, LWORKOPT ) * - IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN + IF( LWORK.LT.LWORKOPT .AND. 
.NOT.LQUERY ) THEN INFO = -11 END IF * @@ -277,14 +284,14 @@ CALL XERBLA( 'CGETSQRHRT', -INFO ) RETURN ELSE IF ( LQUERY ) THEN - WORK( 1 ) = CMPLX( LWORKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWORKOPT ) RETURN END IF * * Quick return if possible * IF( MIN( M, N ).EQ.0 ) THEN - WORK( 1 ) = CMPLX( LWORKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWORKOPT ) RETURN END IF * @@ -341,9 +348,9 @@ END IF END DO * - WORK( 1 ) = CMPLX( LWORKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWORKOPT ) RETURN * * End of CGETSQRHRT * - END \ No newline at end of file + END diff --git a/lapack-netlib/SRC/cgges3.f b/lapack-netlib/SRC/cgges3.f index aac9f9510..c1ca79688 100644 --- a/lapack-netlib/SRC/cgges3.f +++ b/lapack-netlib/SRC/cgges3.f @@ -215,7 +215,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= MAX(1,2*N). +*> For good performance, LWORK must generally be larger. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -260,7 +261,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEeigen +*> \ingroup gges3 * * ===================================================================== SUBROUTINE CGGES3( JOBVSL, JOBVSR, SORT, SELCTG, N, A, LDA, B, @@ -300,7 +301,8 @@ LOGICAL CURSL, ILASCL, ILBSCL, ILVSL, ILVSR, LASTSL, $ LQUERY, WANTST INTEGER I, ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT, - $ ILO, IRIGHT, IROWS, IRWRK, ITAU, IWRK, LWKOPT + $ ILO, IRIGHT, IROWS, IRWRK, ITAU, IWRK, LWKOPT, + $ LWKMIN REAL ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS, PVSL, $ PVSR, SMLNUM * .. @@ -310,13 +312,12 @@ * .. * .. External Subroutines .. EXTERNAL CGEQRF, CGGBAK, CGGBAL, CGGHD3, CLAQZ0, CLACPY, - $ CLASCL, CLASET, CTGSEN, CUNGQR, CUNMQR, SLABAD, - $ XERBLA + $ CLASCL, CLASET, CTGSEN, CUNGQR, CUNMQR, XERBLA * .. * .. External Functions .. LOGICAL LSAME - REAL CLANGE, SLAMCH - EXTERNAL LSAME, CLANGE, SLAMCH + REAL CLANGE, SLAMCH, SROUNDUP_LWORK + EXTERNAL LSAME, CLANGE, SLAMCH, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC MAX, SQRT @@ -353,6 +354,8 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 2*N ) +* IF( IJOBVL.LE.0 ) THEN INFO = -1 ELSE IF( IJOBVR.LE.0 ) THEN @@ -369,7 +372,7 @@ INFO = -14 ELSE IF( LDVSR.LT.1 .OR. ( ILVSR .AND. LDVSR.LT.N ) ) THEN INFO = -16 - ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. 
.NOT.LQUERY ) THEN INFO = -18 END IF * @@ -377,29 +380,33 @@ * IF( INFO.EQ.0 ) THEN CALL CGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR ) - LWKOPT = MAX( 1, N + INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKMIN, N + INT( WORK( 1 ) ) ) CALL CUNMQR( 'L', 'C', N, N, N, B, LDB, WORK, A, LDA, WORK, $ -1, IERR ) - LWKOPT = MAX( LWKOPT, N + INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, N + INT( WORK( 1 ) ) ) IF( ILVSL ) THEN CALL CUNGQR( N, N, N, VSL, LDVSL, WORK, WORK, -1, $ IERR ) - LWKOPT = MAX( LWKOPT, N + INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, N + INT( WORK( 1 ) ) ) END IF CALL CGGHD3( JOBVSL, JOBVSR, N, 1, N, A, LDA, B, LDB, VSL, $ LDVSL, VSR, LDVSR, WORK, -1, IERR ) - LWKOPT = MAX( LWKOPT, N + INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, N + INT( WORK( 1 ) ) ) CALL CLAQZ0( 'S', JOBVSL, JOBVSR, N, 1, N, A, LDA, B, LDB, $ ALPHA, BETA, VSL, LDVSL, VSR, LDVSR, WORK, -1, $ RWORK, 0, IERR ) - LWKOPT = MAX( LWKOPT, INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) ) IF( WANTST ) THEN CALL CTGSEN( 0, ILVSL, ILVSR, BWORK, N, A, LDA, B, LDB, $ ALPHA, BETA, VSL, LDVSL, VSR, LDVSR, SDIM, $ PVSL, PVSR, DIF, WORK, -1, IDUM, 1, IERR ) - LWKOPT = MAX( LWKOPT, INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) ) + END IF + IF( N.EQ.0 ) THEN + WORK( 1 ) = 1 + ELSE + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF - WORK( 1 ) = CMPLX( LWKOPT ) END IF * @@ -422,7 +429,6 @@ EPS = SLAMCH( 'P' ) SMLNUM = SLAMCH( 'S' ) BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) SMLNUM = SQRT( SMLNUM ) / EPS BIGNUM = ONE / SMLNUM * @@ -585,7 +591,7 @@ * 30 CONTINUE * - WORK( 1 ) = CMPLX( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/cggev3.f b/lapack-netlib/SRC/cggev3.f index 9483ecdeb..d2b75aebc 100644 --- a/lapack-netlib/SRC/cggev3.f +++ b/lapack-netlib/SRC/cggev3.f @@ -174,7 +174,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= MAX(1,2*N). +*> For good performance, LWORK must generally be larger. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -208,7 +209,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEeigen +*> \ingroup ggev3 * * ===================================================================== SUBROUTINE CGGEV3( JOBVL, JOBVR, N, A, LDA, B, LDB, ALPHA, BETA, @@ -243,7 +244,7 @@ CHARACTER CHTEMP INTEGER ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT, ILO, $ IN, IRIGHT, IROWS, IRWRK, ITAU, IWRK, JC, JR, - $ LWKOPT + $ LWKOPT, LWKMIN REAL ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS, $ SMLNUM, TEMP COMPLEX X @@ -253,13 +254,12 @@ * .. * .. External Subroutines .. EXTERNAL CGEQRF, CGGBAK, CGGBAL, CGGHD3, CLAQZ0, CLACPY, - $ CLASCL, CLASET, CTGEVC, CUNGQR, CUNMQR, SLABAD, - $ XERBLA + $ CLASCL, CLASET, CTGEVC, CUNGQR, CUNMQR, XERBLA * .. * .. External Functions .. LOGICAL LSAME - REAL CLANGE, SLAMCH - EXTERNAL LSAME, CLANGE, SLAMCH + REAL CLANGE, SLAMCH, SROUNDUP_LWORK + EXTERNAL LSAME, CLANGE, SLAMCH, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC ABS, AIMAG, MAX, REAL, SQRT @@ -301,6 +301,7 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 2*N ) IF( IJOBVL.LE.0 ) THEN INFO = -1 ELSE IF( IJOBVR.LE.0 ) THEN @@ -315,7 +316,7 @@ INFO = -11 ELSE IF( LDVR.LT.1 .OR. ( ILVR .AND. LDVR.LT.N ) ) THEN INFO = -13 - ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. 
.NOT.LQUERY ) THEN INFO = -15 END IF * @@ -323,7 +324,7 @@ * IF( INFO.EQ.0 ) THEN CALL CGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR ) - LWKOPT = MAX( N, N+INT( WORK( 1 ) ) ) + LWKOPT = MAX( LWKMIN, N+INT( WORK( 1 ) ) ) CALL CUNMQR( 'L', 'C', N, N, N, B, LDB, WORK, A, LDA, WORK, $ -1, IERR ) LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) ) @@ -348,7 +349,11 @@ $ RWORK, 0, IERR ) LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) ) END IF - WORK( 1 ) = CMPLX( LWKOPT ) + IF( N.EQ.0 ) THEN + WORK( 1 ) = 1 + ELSE + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) + END IF END IF * IF( INFO.NE.0 ) THEN @@ -368,7 +373,6 @@ EPS = SLAMCH( 'E' )*SLAMCH( 'B' ) SMLNUM = SLAMCH( 'S' ) BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) SMLNUM = SQRT( SMLNUM ) / EPS BIGNUM = ONE / SMLNUM * @@ -549,7 +553,7 @@ IF( ILBSCL ) $ CALL CLASCL( 'G', 0, 0, BNRMTO, BNRM, N, 1, BETA, N, IERR ) * - WORK( 1 ) = CMPLX( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * * End of CGGEV3 diff --git a/lapack-netlib/SRC/cgghd3.f b/lapack-netlib/SRC/cgghd3.f index 1074b4828..f7175a72c 100644 --- a/lapack-netlib/SRC/cgghd3.f +++ b/lapack-netlib/SRC/cgghd3.f @@ -180,14 +180,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX array, dimension (LWORK) +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of the array WORK. LWORK >= 1. +*> The length of the array WORK. LWORK >= 1. *> For optimum performance LWORK >= 6*N*NB, where NB is the *> optimal blocksize. *> @@ -212,7 +212,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup gghd3 * *> \par Further Details: * ===================== @@ -265,7 +265,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL ILAENV, LSAME + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CGGHRD, CLARTG, CLASET, CUNM22, CROT, CGEMM, @@ -280,8 +281,13 @@ * INFO = 0 NB = ILAENV( 1, 'CGGHD3', ' ', N, ILO, IHI, -1 ) - LWKOPT = MAX( 6*N*NB, 1 ) - WORK( 1 ) = CMPLX( LWKOPT ) + NH = IHI - ILO + 1 + IF( NH.LE.1 ) THEN + LWKOPT = 1 + ELSE + LWKOPT = 6*N*NB + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) INITQ = LSAME( COMPQ, 'I' ) WANTQ = INITQ .OR. LSAME( COMPQ, 'V' ) INITZ = LSAME( COMPZ, 'I' ) @@ -330,7 +336,6 @@ * * Quick return if possible * - NH = IHI - ILO + 1 IF( NH.LE.1 ) THEN WORK( 1 ) = CONE RETURN @@ -888,7 +893,8 @@ IF ( JCOL.LT.IHI ) $ CALL CGGHRD( COMPQ2, COMPZ2, N, JCOL, IHI, A, LDA, B, LDB, Q, $ LDQ, Z, LDZ, IERR ) - WORK( 1 ) = CMPLX( LWKOPT ) +* + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/cggqrf.f b/lapack-netlib/SRC/cggqrf.f index 29b0bf4af..309f170e8 100644 --- a/lapack-netlib/SRC/cggqrf.f +++ b/lapack-netlib/SRC/cggqrf.f @@ -251,8 +251,8 @@ NB2 = ILAENV( 1, 'CGERQF', ' ', N, P, -1, -1 ) NB3 = ILAENV( 1, 'CUNMQR', ' ', N, M, P, -1 ) NB = MAX( NB1, NB2, NB3 ) - LWKOPT = MAX( N, M, P)*NB - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + LWKOPT = MAX( 1, MAX( N, M, P )*NB ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.0 ) THEN INFO = -1 @@ -288,7 +288,7 @@ * RQ factorization of N-by-P matrix B: B = T*Z. 
* CALL CGERQF( N, P, B, LDB, TAUB, WORK, LWORK, INFO ) - WORK( 1 ) = MAX( LOPT, INT( WORK( 1 ) ) ) + WORK( 1 ) = SROUNDUP_LWORK( MAX( LOPT, INT( WORK( 1 ) ) ) ) * RETURN * diff --git a/lapack-netlib/SRC/cggrqf.f b/lapack-netlib/SRC/cggrqf.f index 273ab3ef7..8470a1ce2 100644 --- a/lapack-netlib/SRC/cggrqf.f +++ b/lapack-netlib/SRC/cggrqf.f @@ -250,8 +250,8 @@ NB2 = ILAENV( 1, 'CGEQRF', ' ', P, N, -1, -1 ) NB3 = ILAENV( 1, 'CUNMRQ', ' ', M, N, P, -1 ) NB = MAX( NB1, NB2, NB3 ) - LWKOPT = MAX( N, M, P)*NB - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + LWKOPT = MAX( 1, MAX( N, M, P )*NB ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -288,7 +288,7 @@ * QR factorization of P-by-N matrix B: B = Z*T * CALL CGEQRF( P, N, B, LDB, TAUB, WORK, LWORK, INFO ) - WORK( 1 ) = MAX( LOPT, INT( WORK( 1 ) ) ) + WORK( 1 ) = SROUNDUP_LWORK( MAX( LOPT, INT( WORK( 1 ) ) ) ) * RETURN * diff --git a/lapack-netlib/SRC/cggsvd3.f b/lapack-netlib/SRC/cggsvd3.f index f248aebd5..4c4b85bae 100644 --- a/lapack-netlib/SRC/cggsvd3.f +++ b/lapack-netlib/SRC/cggsvd3.f @@ -278,7 +278,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -333,7 +333,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEsing +*> \ingroup ggsvd3 * *> \par Contributors: * ================== diff --git a/lapack-netlib/SRC/cggsvp3.f b/lapack-netlib/SRC/cggsvp3.f index 008a053a2..e19f7efd5 100644 --- a/lapack-netlib/SRC/cggsvp3.f +++ b/lapack-netlib/SRC/cggsvp3.f @@ -233,7 +233,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -256,7 +256,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup ggsvp3 * *> \par Further Details: * ===================== diff --git a/lapack-netlib/SRC/cheevd.f b/lapack-netlib/SRC/cheevd.f index b5ca804eb..9b62a2df6 100644 --- a/lapack-netlib/SRC/cheevd.f +++ b/lapack-netlib/SRC/cheevd.f @@ -116,8 +116,7 @@ *> *> \param[out] RWORK *> \verbatim -*> RWORK is REAL array, -*> dimension (LRWORK) +*> RWORK is REAL array, dimension (MAX(1,LRWORK)) *> On exit, if INFO = 0, RWORK(1) returns the optimal LRWORK. *> \endverbatim *> @@ -282,8 +281,8 @@ LROPT = LRWMIN LIOPT = LIWMIN END IF - WORK( 1 ) = SROUNDUP_LWORK(LOPT) - RWORK( 1 ) = LROPT + WORK( 1 ) = SROUNDUP_LWORK( LOPT ) + RWORK( 1 ) = SROUNDUP_LWORK( LROPT ) IWORK( 1 ) = LIOPT * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN @@ -378,8 +377,8 @@ CALL SSCAL( IMAX, ONE / SIGMA, W, 1 ) END IF * - WORK( 1 ) = SROUNDUP_LWORK(LOPT) - RWORK( 1 ) = LROPT + WORK( 1 ) = SROUNDUP_LWORK( LOPT ) + RWORK( 1 ) = SROUNDUP_LWORK( LROPT ) IWORK( 1 ) = LIOPT * RETURN diff --git a/lapack-netlib/SRC/cheevr.f b/lapack-netlib/SRC/cheevr.f index 05c5e66be..ad5c8cd4a 100644 --- a/lapack-netlib/SRC/cheevr.f +++ b/lapack-netlib/SRC/cheevr.f @@ -272,7 +272,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of the array WORK. LWORK >= max(1,2*N). +*> The length of the array WORK. +*> If N <= 1, LWORK >= 1, else LWORK >= 2*N. 
*> For optimal efficiency, LWORK >= (NB+1)*N, *> where NB is the max of the blocksize for CHETRD and for *> CUNMTR as returned by ILAENV. @@ -294,7 +295,8 @@ *> \param[in] LRWORK *> \verbatim *> LRWORK is INTEGER -*> The length of the array RWORK. LRWORK >= max(1,24*N). +*> The length of the array RWORK. +*> If N <= 1, LRWORK >= 1, else LRWORK >= 24*N. *> *> If LRWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal sizes of the WORK, RWORK @@ -313,7 +315,8 @@ *> \param[in] LIWORK *> \verbatim *> LIWORK is INTEGER -*> The dimension of the array IWORK. LIWORK >= max(1,10*N). +*> The dimension of the array IWORK. +*> If N <= 1, LIWORK >= 1, else LIWORK >= 10*N. *> *> If LIWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal sizes of the WORK, RWORK @@ -417,9 +420,15 @@ LQUERY = ( ( LWORK.EQ.-1 ) .OR. ( LRWORK.EQ.-1 ) .OR. $ ( LIWORK.EQ.-1 ) ) * - LRWMIN = MAX( 1, 24*N ) - LIWMIN = MAX( 1, 10*N ) - LWMIN = MAX( 1, 2*N ) + IF( N.LE.1 ) THEN + LWMIN = 1 + LRWMIN = 1 + LIWMIN = 1 + ELSE + LWMIN = 2*N + LRWMIN = 24*N + LIWMIN = 10*N + END IF * INFO = 0 IF( .NOT.( WANTZ .OR. LSAME( JOBZ, 'N' ) ) ) THEN @@ -454,8 +463,8 @@ NB = ILAENV( 1, 'CHETRD', UPLO, N, -1, -1, -1 ) NB = MAX( NB, ILAENV( 1, 'CUNMTR', UPLO, N, -1, -1, -1 ) ) LWKOPT = MAX( ( NB+1 )*N, LWMIN ) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) - RWORK( 1 ) = LRWMIN + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) + RWORK( 1 ) = SROUNDUP_LWORK( LRWMIN ) IWORK( 1 ) = LIWMIN * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN @@ -483,7 +492,7 @@ END IF * IF( N.EQ.1 ) THEN - WORK( 1 ) = 2 + WORK( 1 ) = 1 IF( ALLEIG .OR. INDEIG ) THEN M = 1 W( 1 ) = REAL( A( 1, 1 ) ) @@ -710,8 +719,8 @@ * * Set WORK(1) to optimal workspace size. * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) - RWORK( 1 ) = LRWMIN + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) + RWORK( 1 ) = SROUNDUP_LWORK( LRWMIN ) IWORK( 1 ) = LIWMIN * RETURN diff --git a/lapack-netlib/SRC/cheevr_2stage.f b/lapack-netlib/SRC/cheevr_2stage.f index 0332a09bc..e06925fcd 100644 --- a/lapack-netlib/SRC/cheevr_2stage.f +++ b/lapack-netlib/SRC/cheevr_2stage.f @@ -265,7 +265,7 @@ *> indicating the nonzero elements in Z. The i-th eigenvector *> is nonzero only in elements ISUPPZ( 2*i-1 ) through *> ISUPPZ( 2*i ). This is an output of CSTEMR (tridiagonal -*> matrix). The support of the eigenvectors of A is typically +*> matrix). The support of the eigenvectors of A is typically *> 1:N because of the unitary transformations applied by CUNMTR. *> Implemented only for RANGE = 'A' or 'I' and IU - IL = N - 1 *> \endverbatim @@ -279,12 +279,13 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. +*> If N <= 1, LWORK must be at least 1. *> If JOBZ = 'N' and N > 1, LWORK must be queried. *> LWORK = MAX(1, 26*N, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N + N -*> = N*KD + N*max(KD+1,FACTOPTNB) -*> + max(2*KD*KD, KD*NTHREADS) +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) *> + (KD+1)*N + N *> where KD is the blocking size of the reduction, *> FACTOPTNB is the blocking used by the QR or LQ @@ -310,7 +311,8 @@ *> \param[in] LRWORK *> \verbatim *> LRWORK is INTEGER -*> The length of the array RWORK. LRWORK >= max(1,24*N). +*> The length of the array RWORK. +*> If N <= 1, LRWORK >= 1, else LRWORK >= 24*N. 
*> *> If LRWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal sizes of the WORK, RWORK @@ -329,7 +331,8 @@ *> \param[in] LIWORK *> \verbatim *> LIWORK is INTEGER -*> The dimension of the array IWORK. LIWORK >= max(1,10*N). +*> The dimension of the array IWORK. +*> If N <= 1, LIWORK >= 1, else LIWORK >= 10*N. *> *> If LIWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal sizes of the WORK, RWORK @@ -354,7 +357,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexHEeigen +*> \ingroup heevr_2stage * *> \par Contributors: * ================== @@ -382,7 +385,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -390,11 +393,11 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim * @@ -443,8 +446,9 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV, ILAENV2STAGE - REAL SLAMCH, CLANSY - EXTERNAL LSAME, SLAMCH, CLANSY, ILAENV, ILAENV2STAGE + REAL SLAMCH, CLANSY, SROUNDUP_LWORK + EXTERNAL LSAME, SLAMCH, CLANSY, ILAENV, ILAENV2STAGE, + $ SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SCOPY, SSCAL, SSTEBZ, SSTERF, XERBLA, CSSCAL, @@ -472,9 +476,16 @@ IB = ILAENV2STAGE( 2, 'CHETRD_2STAGE', JOBZ, N, KD, -1, -1 ) LHTRD = ILAENV2STAGE( 3, 'CHETRD_2STAGE', JOBZ, N, KD, IB, -1 ) LWTRD = ILAENV2STAGE( 4, 'CHETRD_2STAGE', JOBZ, N, KD, IB, -1 ) - LWMIN = N + LHTRD + LWTRD - LRWMIN = MAX( 1, 24*N ) - LIWMIN = MAX( 1, 10*N ) +* + IF( N.LE.1 ) THEN + LWMIN = 1 + LRWMIN = 1 + LIWMIN = 1 + ELSE + LWMIN = N + LHTRD + LWTRD + LRWMIN = 24*N + LIWMIN = 10*N + END IF * INFO = 0 IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN @@ -506,8 +517,8 @@ END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = LWMIN - RWORK( 1 ) = LRWMIN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) + RWORK( 1 ) = SROUNDUP_LWORK( LRWMIN ) IWORK( 1 ) = LIWMIN * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN @@ -535,7 +546,7 @@ END IF * IF( N.EQ.1 ) THEN - WORK( 1 ) = 2 + WORK( 1 ) = 1 IF( ALLEIG .OR. INDEIG ) THEN M = 1 W( 1 ) = REAL( A( 1, 1 ) ) @@ -643,9 +654,9 @@ * * Call CHETRD_2STAGE to reduce Hermitian matrix to tridiagonal form. * - CALL CHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, RWORK( INDRD ), + CALL CHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, RWORK( INDRD ), $ RWORK( INDRE ), WORK( INDTAU ), - $ WORK( INDHOUS ), LHTRD, + $ WORK( INDHOUS ), LHTRD, $ WORK( INDWK ), LLWORK, IINFO ) * * If all eigenvalues are desired @@ -666,7 +677,7 @@ CALL SCOPY( N-1, RWORK( INDRE ), 1, RWORK( INDREE ), 1 ) CALL SCOPY( N, RWORK( INDRD ), 1, RWORK( INDRDD ), 1 ) * - IF (ABSTOL .LE. TWO*N*EPS) THEN + IF ( ABSTOL .LE. TWO*N*EPS ) THEN TRYRAC = .TRUE. ELSE TRYRAC = .FALSE. 
@@ -765,8 +776,8 @@ * * Set WORK(1) to optimal workspace size. * - WORK( 1 ) = LWMIN - RWORK( 1 ) = LRWMIN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) + RWORK( 1 ) = SROUNDUP_LWORK( LRWMIN ) IWORK( 1 ) = LIWMIN * RETURN diff --git a/lapack-netlib/SRC/cheevx.f b/lapack-netlib/SRC/cheevx.f index e91599a44..a8a2bde63 100644 --- a/lapack-netlib/SRC/cheevx.f +++ b/lapack-netlib/SRC/cheevx.f @@ -348,14 +348,14 @@ IF( INFO.EQ.0 ) THEN IF( N.LE.1 ) THEN LWKMIN = 1 - WORK( 1 ) = LWKMIN + LWKOPT = 1 ELSE LWKMIN = 2*N NB = ILAENV( 1, 'CHETRD', UPLO, N, -1, -1, -1 ) NB = MAX( NB, ILAENV( 1, 'CUNMTR', UPLO, N, -1, -1, -1 ) ) - LWKOPT = MAX( 1, ( NB + 1 )*N ) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + LWKOPT = ( NB + 1 )*N END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) $ INFO = -17 diff --git a/lapack-netlib/SRC/chesv_aa.f b/lapack-netlib/SRC/chesv_aa.f index 53ecc0a16..0f41c9332 100644 --- a/lapack-netlib/SRC/chesv_aa.f +++ b/lapack-netlib/SRC/chesv_aa.f @@ -177,7 +177,7 @@ * * .. Local Scalars .. LOGICAL LQUERY - INTEGER LWKOPT, LWKOPT_HETRF, LWKOPT_HETRS + INTEGER LWKMIN, LWKOPT, LWKOPT_HETRF, LWKOPT_HETRS * .. * .. External Functions .. LOGICAL LSAME @@ -197,6 +197,7 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 2*N, 3*N-2 ) IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -207,18 +208,18 @@ INFO = -5 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -8 - ELSE IF( LWORK.LT.MAX( 2*N, 3*N-2 ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF * IF( INFO.EQ.0 ) THEN CALL CHETRF_AA( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) - LWKOPT_HETRF = INT( WORK(1) ) + LWKOPT_HETRF = INT( WORK( 1 ) ) CALL CHETRS_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, $ -1, INFO ) - LWKOPT_HETRS = INT( WORK(1) ) - LWKOPT = MAX( LWKOPT_HETRF, LWKOPT_HETRS ) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + LWKOPT_HETRS = INT( WORK( 1 ) ) + LWKOPT = MAX( LWKMIN, LWKOPT_HETRF, LWKOPT_HETRS ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -240,7 +241,7 @@ * END IF * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/chesv_aa_2stage.f b/lapack-netlib/SRC/chesv_aa_2stage.f index 12950c4af..05ebd9253 100644 --- a/lapack-netlib/SRC/chesv_aa_2stage.f +++ b/lapack-netlib/SRC/chesv_aa_2stage.f @@ -99,14 +99,14 @@ *> *> \param[out] TB *> \verbatim -*> TB is COMPLEX array, dimension (LTB) +*> TB is COMPLEX array, dimension (MAX(1,LTB)). *> On exit, details of the LU factorization of the band matrix. *> \endverbatim *> *> \param[in] LTB *> \verbatim *> LTB is INTEGER -*> The size of the array TB. LTB >= 4*N, internally +*> The size of the array TB. LTB >= MAX(1,4*N), internally *> used to select NB such that LTB >= (3*NB+1)*N. *> *> If LTB = -1, then a workspace query is assumed; the @@ -146,14 +146,15 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX workspace of size LWORK +*> WORK is COMPLEX workspace of size (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The size of WORK. LWORK >= N, internally used to select NB -*> such that LWORK >= N*NB. +*> The size of WORK. LWORK >= MAX(1,N), internally used to +*> select NB such that LWORK >= N*NB. *> *> If LWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the WORK array, @@ -203,7 +204,7 @@ * * .. Local Scalars .. 
LOGICAL UPPER, TQUERY, WQUERY - INTEGER LWKOPT + INTEGER LWKMIN, LWKOPT * .. * .. External Functions .. LOGICAL LSAME @@ -225,6 +226,7 @@ UPPER = LSAME( UPLO, 'U' ) WQUERY = ( LWORK.EQ.-1 ) TQUERY = ( LTB.EQ.-1 ) + LWKMIN = MAX( 1, N ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -233,18 +235,19 @@ INFO = -3 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -5 - ELSE IF( LTB.LT.( 4*N ) .AND. .NOT.TQUERY ) THEN + ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN INFO = -7 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -11 - ELSE IF( LWORK.LT.N .AND. .NOT.WQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.WQUERY ) THEN INFO = -13 END IF * IF( INFO.EQ.0 ) THEN CALL CHETRF_AA_2STAGE( UPLO, N, A, LDA, TB, -1, IPIV, $ IPIV2, WORK, -1, INFO ) - LWKOPT = INT( WORK(1) ) + LWKOPT = MAX( LWKMIN, INT( WORK( 1 ) ) ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -254,7 +257,6 @@ RETURN END IF * -* * Compute the factorization A = U**H*T*U or A = L*T*L**H. * CALL CHETRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, IPIV2, @@ -268,7 +270,7 @@ * END IF * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/chesvx.f b/lapack-netlib/SRC/chesvx.f index c23a35ce7..bdaad55ec 100644 --- a/lapack-netlib/SRC/chesvx.f +++ b/lapack-netlib/SRC/chesvx.f @@ -307,7 +307,7 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, NOFACT - INTEGER LWKOPT, NB + INTEGER LWKMIN, LWKOPT, NB REAL ANORM * .. * .. External Functions .. @@ -329,6 +329,7 @@ INFO = 0 NOFACT = LSAME( FACT, 'N' ) LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 2*N ) IF( .NOT.NOFACT .AND. .NOT.LSAME( FACT, 'F' ) ) THEN INFO = -1 ELSE IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) @@ -346,17 +347,17 @@ INFO = -11 ELSE IF( LDX.LT.MAX( 1, N ) ) THEN INFO = -13 - ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -18 END IF * IF( INFO.EQ.0 ) THEN - LWKOPT = MAX( 1, 2*N ) + LWKOPT = LWKMIN IF( NOFACT ) THEN NB = ILAENV( 1, 'CHETRF', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( LWKOPT, N*NB ) END IF - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -405,7 +406,7 @@ IF( RCOND.LT.SLAMCH( 'Epsilon' ) ) $ INFO = N + 1 * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/chetrd_2stage.f b/lapack-netlib/SRC/chetrd_2stage.f index f5ad35f27..ec7075798 100644 --- a/lapack-netlib/SRC/chetrd_2stage.f +++ b/lapack-netlib/SRC/chetrd_2stage.f @@ -4,23 +4,23 @@ * * =========== DOCUMENTATION =========== * -* Online html documentation available at -* http://www.netlib.org/lapack/explore-html/ +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ * *> \htmlonly -*> Download CHETRD_2STAGE + dependencies -*> -*> [TGZ] -*> -*> [ZIP] -*> +*> Download CHETRD_2STAGE + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> *> [TXT] -*> \endhtmlonly +*> \endhtmlonly * * Definition: * =========== * -* SUBROUTINE CHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, +* SUBROUTINE CHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, * HOUS2, LHOUS2, WORK, LWORK, INFO ) * * IMPLICIT NONE @@ -34,7 +34,7 @@ * COMPLEX A( LDA, * ), TAU( * ), * HOUS2( * ), WORK( * ) * .. 
-* +* * *> \par Purpose: * ============= @@ -52,11 +52,11 @@ *> \param[in] VECT *> \verbatim *> VECT is CHARACTER*1 -*> = 'N': No need for the Housholder representation, +*> = 'N': No need for the Housholder representation, *> in particular for the second stage (Band to *> tridiagonal) and thus LHOUS2 is of size max(1, 4*N); -*> = 'V': the Householder representation is needed to -*> either generate Q1 Q2 or to apply Q1 Q2, +*> = 'V': the Householder representation is needed to +*> either generate Q1 Q2 or to apply Q1 Q2, *> then LHOUS2 is to be queried and computed. *> (NOT AVAILABLE IN THIS RELEASE). *> \endverbatim @@ -86,7 +86,7 @@ *> triangular part of A is not referenced. *> On exit, if UPLO = 'U', the band superdiagonal *> of A are overwritten by the corresponding elements of the -*> internal band-diagonal matrix AB, and the elements above +*> internal band-diagonal matrix AB, and the elements above *> the KD superdiagonal, with the array TAU, represent the unitary *> matrix Q1 as a product of elementary reflectors; if UPLO *> = 'L', the diagonal and band subdiagonal of A are over- @@ -117,13 +117,13 @@ *> \param[out] TAU *> \verbatim *> TAU is COMPLEX array, dimension (N-KD) -*> The scalar factors of the elementary reflectors of +*> The scalar factors of the elementary reflectors of *> the first stage (see Further Details). *> \endverbatim *> *> \param[out] HOUS2 *> \verbatim -*> HOUS2 is COMPLEX array, dimension (LHOUS2) +*> HOUS2 is COMPLEX array, dimension (MAX(1,LHOUS2)) *> Stores the Householder representation of the stage2 *> band to tridiagonal. *> \endverbatim @@ -132,6 +132,8 @@ *> \verbatim *> LHOUS2 is INTEGER *> The dimension of the array HOUS2. +*> LHOUS2 >= 1. +*> *> If LWORK = -1, or LHOUS2=-1, *> then a query is assumed; the routine *> only calculates the optimal size of the HOUS2 array, returns @@ -143,13 +145,16 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX array, dimension (LWORK) +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK = MAX(1, dimension) +*> The dimension of the array WORK. +*> If N = 0, LWORK >= 1, else LWORK = MAX(1, dimension). +*> *> If LWORK = -1, or LHOUS2 = -1, *> then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -157,9 +162,9 @@ *> message related to LWORK is issued by XERBLA. *> LWORK = MAX(1, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N -*> = N*KD + N*max(KD+1,FACTOPTNB) -*> + max(2*KD*KD, KD*NTHREADS) -*> + (KD+1)*N +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) +*> + (KD+1)*N *> where KD is the blocking size of the reduction, *> FACTOPTNB is the blocking used by the QR or LQ *> algorithm, usually FACTOPTNB=128 is a good choice @@ -177,12 +182,12 @@ * Authors: * ======== * -*> \author Univ. of Tennessee -*> \author Univ. of California Berkeley -*> \author Univ. of Colorado Denver -*> \author NAG Ltd. +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. * -*> \ingroup complexHEcomputational +*> \ingroup hetrd_2stage * *> \par Further Details: * ===================== @@ -202,7 +207,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. 
-*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -210,16 +215,16 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim *> * ===================================================================== - SUBROUTINE CHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, + SUBROUTINE CHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, $ HOUS2, LHOUS2, WORK, LWORK, INFO ) * IMPLICIT NONE @@ -250,7 +255,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV2STAGE - EXTERNAL LSAME, ILAENV2STAGE + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV2STAGE, SROUNDUP_LWORK * .. * .. Executable Statements .. * @@ -265,10 +271,13 @@ * KD = ILAENV2STAGE( 1, 'CHETRD_2STAGE', VECT, N, -1, -1, -1 ) IB = ILAENV2STAGE( 2, 'CHETRD_2STAGE', VECT, N, KD, -1, -1 ) - LHMIN = ILAENV2STAGE( 3, 'CHETRD_2STAGE', VECT, N, KD, IB, -1 ) - LWMIN = ILAENV2STAGE( 4, 'CHETRD_2STAGE', VECT, N, KD, IB, -1 ) -* WRITE(*,*),'CHETRD_2STAGE N KD UPLO LHMIN LWMIN ',N, KD, UPLO, -* $ LHMIN, LWMIN + IF( N.EQ.0 ) THEN + LHMIN = 1 + LWMIN = 1 + ELSE + LHMIN = ILAENV2STAGE( 3, 'CHETRD_2STAGE', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV2STAGE( 4, 'CHETRD_2STAGE', VECT, N, KD, IB, -1 ) + END IF * IF( .NOT.LSAME( VECT, 'N' ) ) THEN INFO = -1 @@ -285,8 +294,8 @@ END IF * IF( INFO.EQ.0 ) THEN - HOUS2( 1 ) = LHMIN - WORK( 1 ) = LWMIN + HOUS2( 1 ) = SROUNDUP_LWORK( LHMIN ) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN @@ -309,14 +318,14 @@ LWRK = LWORK-LDAB*N ABPOS = 1 WPOS = ABPOS + LDAB*N - CALL CHETRD_HE2HB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB, + CALL CHETRD_HE2HB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB, $ TAU, WORK( WPOS ), LWRK, INFO ) IF( INFO.NE.0 ) THEN CALL XERBLA( 'CHETRD_HE2HB', -INFO ) RETURN END IF - CALL CHETRD_HB2ST( 'Y', VECT, UPLO, N, KD, - $ WORK( ABPOS ), LDAB, D, E, + CALL CHETRD_HB2ST( 'Y', VECT, UPLO, N, KD, + $ WORK( ABPOS ), LDAB, D, E, $ HOUS2, LHOUS2, WORK( WPOS ), LWRK, INFO ) IF( INFO.NE.0 ) THEN CALL XERBLA( 'CHETRD_HB2ST', -INFO ) @@ -324,8 +333,7 @@ END IF * * - HOUS2( 1 ) = LHMIN - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of CHETRD_2STAGE diff --git a/lapack-netlib/SRC/chetrd_hb2st.F b/lapack-netlib/SRC/chetrd_hb2st.F index 3688e40a3..b0d3e45fb 100644 --- a/lapack-netlib/SRC/chetrd_hb2st.F +++ b/lapack-netlib/SRC/chetrd_hb2st.F @@ -132,15 +132,17 @@ *> *> \param[out] HOUS *> \verbatim -*> HOUS is COMPLEX array, dimension LHOUS, that -*> store the Householder representation. +*> HOUS is COMPLEX array, dimension (MAX(1,LHOUS)) +*> Stores the Householder representation. *> \endverbatim *> *> \param[in] LHOUS *> \verbatim *> LHOUS is INTEGER -*> The dimension of the array HOUS. LHOUS = MAX(1, dimension) -*> If LWORK = -1, or LHOUS=-1, +*> The dimension of the array HOUS. 
+*> If N = 0 or KD <= 1, LHOUS >= 1, else LHOUS = MAX(1, dimension). +*> +*> If LWORK = -1, or LHOUS = -1, *> then a query is assumed; the routine *> only calculates the optimal size of the HOUS array, returns *> this value as the first entry of the HOUS array, and no error @@ -152,14 +154,17 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX array, dimension LWORK. +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)). +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK = MAX(1, dimension) -*> If LWORK = -1, or LHOUS=-1, +*> The dimension of the array WORK. +*> If N = 0 or KD <= 1, LWORK >= 1, else LWORK = MAX(1, dimension). +*> +*> If LWORK = -1, or LHOUS = -1, *> then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error @@ -262,7 +267,7 @@ INTEGER I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST, $ ED, STIND, EDIND, BLKLASTIND, COLPT, THED, $ STEPERCOL, GRSIZ, THGRSIZ, THGRNB, THGRID, - $ NBTILES, TTYPE, TID, NTHREADS, DEBUG, + $ NBTILES, TTYPE, TID, NTHREADS, $ ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS, $ INDA, INDW, APOS, SIZEA, LDA, INDV, INDTAU, $ SICEV, SIZETAU, LDV, LHMIN, LWMIN @@ -286,7 +291,6 @@ * Determine the minimal workspace size required. * Test the input parameters * - DEBUG = 0 INFO = 0 AFTERS1 = LSAME( STAGE1, 'Y' ) WANTQ = LSAME( VECT, 'V' ) @@ -295,9 +299,14 @@ * * Determine the block size, the workspace size and the hous size. * - IB = ILAENV2STAGE( 2, 'CHETRD_HB2ST', VECT, N, KD, -1, -1 ) - LHMIN = ILAENV2STAGE( 3, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 ) - LWMIN = ILAENV2STAGE( 4, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 ) + IB = ILAENV2STAGE( 2, 'CHETRD_HB2ST', VECT, N, KD, -1, -1 ) + IF( N.EQ.0 .OR. KD.LE.1 ) THEN + LHMIN = 1 + LWMIN = 1 + ELSE + LHMIN = ILAENV2STAGE( 3, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV2STAGE( 4, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 ) + END IF * IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN INFO = -1 @@ -318,8 +327,8 @@ END IF * IF( INFO.EQ.0 ) THEN - HOUS( 1 ) = LHMIN - WORK( 1 ) = SROUNDUP_LWORK(LWMIN) + HOUS( 1 ) = SROUNDUP_LWORK( LHMIN ) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN @@ -575,8 +584,7 @@ C END IF 170 CONTINUE ENDIF * - HOUS( 1 ) = LHMIN - WORK( 1 ) = SROUNDUP_LWORK(LWMIN) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of CHETRD_HB2ST diff --git a/lapack-netlib/SRC/chetrd_he2hb.f b/lapack-netlib/SRC/chetrd_he2hb.f index 090f02100..42e71e0b2 100644 --- a/lapack-netlib/SRC/chetrd_he2hb.f +++ b/lapack-netlib/SRC/chetrd_he2hb.f @@ -123,8 +123,8 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX array, dimension (LWORK) -*> On exit, if INFO = 0, or if LWORK=-1, +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, or if LWORK = -1, *> WORK(1) returns the size of LWORK. *> \endverbatim *> @@ -132,7 +132,9 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK which should be calculated -*> by a workspace query. LWORK = MAX(1, LWORK_QUERY) +*> by a workspace query. +*> If N <= KD+1, LWORK >= 1, else LWORK = MAX(1, LWORK_QUERY). 
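Across the two-stage reduction routines touched above, the patch settles on a single contract: the minimum workspace is never below 1, and a query with LWORK = -1 (and, where present, LHOUS2 = -1) reports the required sizes through the first workspace entries. A minimal caller-side sketch of that convention for CHETRD_2STAGE, in free-form Fortran, assuming N, A, LDA, D, E and TAU are already set up for an N-by-N Hermitian matrix (illustrative only, not taken from the patch):

      COMPLEX              :: QW(1), QH(1)
      COMPLEX, ALLOCATABLE :: WORK(:), HOUS2(:)
      INTEGER              :: LWORK, LHOUS2, INFO
      ! Query call: LHOUS2 = -1 and LWORK = -1 compute only the sizes,
      ! which come back (via SROUNDUP_LWORK) in QH(1) and QW(1).
      CALL CHETRD_2STAGE('N','U',N,A,LDA,D,E,TAU,QH,-1,QW,-1,INFO)
      LHOUS2 = INT( REAL( QH(1) ) )
      LWORK  = INT( REAL( QW(1) ) )
      ALLOCATE( HOUS2(LHOUS2), WORK(LWORK) )
      ! Actual reduction with the queried workspace sizes.
      CALL CHETRD_2STAGE('N','U',N,A,LDA,D,E,TAU,HOUS2,LHOUS2,WORK,LWORK,INFO)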
+*> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error @@ -294,8 +296,12 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) - LWMIN = ILAENV2STAGE( 4, 'CHETRD_HE2HB', '', N, KD, -1, -1 ) - + IF( N.LE.KD+1 ) THEN + LWMIN = 1 + ELSE + LWMIN = ILAENV2STAGE( 4, 'CHETRD_HE2HB', '', N, KD, -1, -1 ) + END IF +* IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -314,7 +320,7 @@ CALL XERBLA( 'CHETRD_HE2HB', -INFO ) RETURN ELSE IF( LQUERY ) THEN - WORK( 1 ) = SROUNDUP_LWORK(LWMIN) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN END IF * @@ -507,7 +513,7 @@ END IF * - WORK( 1 ) = SROUNDUP_LWORK(LWMIN) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of CHETRD_HE2HB diff --git a/lapack-netlib/SRC/chetrf.f b/lapack-netlib/SRC/chetrf.f index 0c596ffe7..2836e30bc 100644 --- a/lapack-netlib/SRC/chetrf.f +++ b/lapack-netlib/SRC/chetrf.f @@ -107,7 +107,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned by ILAENV. *> \endverbatim *> @@ -228,8 +228,8 @@ * Determine the block size * NB = ILAENV( 1, 'CHETRF', UPLO, N, -1, -1, -1 ) - LWKOPT = N*NB - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + LWKOPT = MAX( 1, N*NB ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -347,7 +347,7 @@ END IF * 40 CONTINUE - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * * End of CHETRF diff --git a/lapack-netlib/SRC/chetrf_aa.f b/lapack-netlib/SRC/chetrf_aa.f index 0547a4eab..51410a6ed 100644 --- a/lapack-netlib/SRC/chetrf_aa.f +++ b/lapack-netlib/SRC/chetrf_aa.f @@ -101,8 +101,10 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >= 2*N. For optimum performance -*> LWORK >= N*(1+NB), where NB is the optimal blocksize. +*> The length of WORK. +*> LWORK >= 1, if N <= 1, and LWORK >= 2*N, otherwise. +*> For optimum performance LWORK >= N*(1+NB), where NB is +*> the optimal blocksize, returned by ILAENV. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -128,7 +130,7 @@ *> \ingroup hetrf_aa * * ===================================================================== - SUBROUTINE CHETRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO) + SUBROUTINE CHETRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -152,7 +154,7 @@ * * .. Local Scalars .. LOGICAL LQUERY, UPPER - INTEGER J, LWKOPT + INTEGER J, LWKMIN, LWKOPT INTEGER NB, MJ, NJ, K1, K2, J1, J2, J3, JB COMPLEX ALPHA * .. @@ -179,19 +181,26 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) + IF( N.LE.1 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = 2*N + LWKOPT = (NB+1)*N + END IF +* IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.( 2*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. 
.NOT.LQUERY ) THEN INFO = -7 END IF * IF( INFO.EQ.0 ) THEN - LWKOPT = (NB+1)*N - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -203,11 +212,11 @@ * * Quick return * - IF ( N.EQ.0 ) THEN + IF( N.EQ.0 ) THEN RETURN ENDIF IPIV( 1 ) = 1 - IF ( N.EQ.1 ) THEN + IF( N.EQ.1 ) THEN A( 1, 1 ) = REAL( A( 1, 1 ) ) RETURN END IF @@ -460,7 +469,7 @@ END IF * 20 CONTINUE - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * * End of CHETRF_AA diff --git a/lapack-netlib/SRC/chetrf_aa_2stage.f b/lapack-netlib/SRC/chetrf_aa_2stage.f index 400efdf26..a79343753 100644 --- a/lapack-netlib/SRC/chetrf_aa_2stage.f +++ b/lapack-netlib/SRC/chetrf_aa_2stage.f @@ -87,14 +87,14 @@ *> *> \param[out] TB *> \verbatim -*> TB is COMPLEX array, dimension (LTB) +*> TB is COMPLEX array, dimension (MAX(1,LTB)) *> On exit, details of the LU factorization of the band matrix. *> \endverbatim *> *> \param[in] LTB *> \verbatim *> LTB is INTEGER -*> The size of the array TB. LTB >= 4*N, internally +*> The size of the array TB. LTB >= MAX(1,4*N), internally *> used to select NB such that LTB >= (3*NB+1)*N. *> *> If LTB = -1, then a workspace query is assumed; the @@ -121,14 +121,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX workspace of size LWORK +*> WORK is COMPLEX workspace of size (MAX(1,LWORK)) *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The size of WORK. LWORK >= N, internally used to select NB -*> such that LWORK >= N*NB. +*> The size of WORK. LWORK >= MAX(1,N), internally used +*> to select NB such that LWORK >= N*NB. *> *> If LWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the WORK array, @@ -152,7 +152,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexSYcomputational +*> \ingroup hetrf_aa_2stage * * ===================================================================== SUBROUTINE CHETRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, @@ -188,7 +188,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. @@ -213,9 +214,9 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF ( LTB .LT. 4*N .AND. .NOT.TQUERY ) THEN + ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN INFO = -6 - ELSE IF ( LWORK .LT. N .AND. .NOT.WQUERY ) THEN + ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.WQUERY ) THEN INFO = -10 END IF * @@ -229,10 +230,10 @@ NB = ILAENV( 1, 'CHETRF_AA_2STAGE', UPLO, N, -1, -1, -1 ) IF( INFO.EQ.0 ) THEN IF( TQUERY ) THEN - TB( 1 ) = (3*NB+1)*N + TB( 1 ) = SROUNDUP_LWORK( MAX( 1, (3*NB+1)*N ) ) END IF IF( WQUERY ) THEN - WORK( 1 ) = N*NB + WORK( 1 ) = SROUNDUP_LWORK( MAX( 1, N*NB ) ) END IF END IF IF( TQUERY .OR. WQUERY ) THEN @@ -241,7 +242,7 @@ * * Quick return * - IF ( N.EQ.0 ) THEN + IF( N.EQ.0 ) THEN RETURN ENDIF * diff --git a/lapack-netlib/SRC/chetrf_rk.f b/lapack-netlib/SRC/chetrf_rk.f index ef442c937..a13c740e3 100644 --- a/lapack-netlib/SRC/chetrf_rk.f +++ b/lapack-netlib/SRC/chetrf_rk.f @@ -177,14 +177,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX array, dimension ( MAX(1,LWORK) ). +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)). *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. 
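A recurring change in these CHETRF-family hunks is that the size reported in WORK(1) now goes through SROUNDUP_LWORK instead of being assigned directly. The motivation is numerical: WORK is single precision here, and a large INTEGER can round down on conversion, so a caller that later does LWORK = INT(WORK(1)) could under-allocate. A small sketch of the effect, with an illustrative size only:

      REAL, EXTERNAL :: SROUNDUP_LWORK  ! LAPACK helper; its result truncates back to >= the argument
      INTEGER        :: LWKOPT
      REAL           :: W1
      LWKOPT = 16777217                 ! 2**24 + 1 has no exact single-precision representation
      W1 = REAL( LWKOPT )               ! rounds to 16777216.0, so INT(W1) < LWKOPT
      W1 = SROUNDUP_LWORK( LWKOPT )     ! rounded up instead, so INT(W1) >= LWKOPT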
LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned *> by ILAENV. *> @@ -311,8 +311,8 @@ * Determine the block size * NB = ILAENV( 1, 'CHETRF_RK', UPLO, N, -1, -1, -1 ) - LWKOPT = N*NB - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + LWKOPT = MAX( 1, N*NB ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -488,7 +488,7 @@ * END IF * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * * End of CHETRF_RK diff --git a/lapack-netlib/SRC/chetrf_rook.f b/lapack-netlib/SRC/chetrf_rook.f index 1593c2edc..df0323520 100644 --- a/lapack-netlib/SRC/chetrf_rook.f +++ b/lapack-netlib/SRC/chetrf_rook.f @@ -122,7 +122,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned by ILAENV. *> *> If LWORK = -1, then a workspace query is assumed; the routine @@ -264,7 +264,7 @@ * NB = ILAENV( 1, 'CHETRF_ROOK', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( 1, N*NB ) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -387,7 +387,7 @@ END IF * 40 CONTINUE - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * * End of CHETRF_ROOK diff --git a/lapack-netlib/SRC/chetri2.f b/lapack-netlib/SRC/chetri2.f index 2865a6440..f15065ae7 100644 --- a/lapack-netlib/SRC/chetri2.f +++ b/lapack-netlib/SRC/chetri2.f @@ -88,16 +88,16 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX array, dimension (N+NB+1)*(NB+3) +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)) *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*> WORK is size >= (N+NB+1)*(NB+3) +*> If N = 0, LWORK >= 1, else LWORK >= (N+NB+1)*(NB+3). *> If LWORK = -1, then a workspace query is assumed; the routine -*> calculates: +*> calculates: *> - the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, *> - and no error message related to LWORK is issued by XERBLA. @@ -120,7 +120,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexHEcomputational +*> \ingroup hetri2 * * ===================================================================== SUBROUTINE CHETRI2( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO ) @@ -147,7 +147,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CHETRI2X, CHETRI, XERBLA @@ -159,9 +160,13 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) +* * Get blocksize +* NBMAX = ILAENV( 1, 'CHETRF', UPLO, N, -1, -1, -1 ) - IF ( NBMAX .GE. N ) THEN + IF( N.EQ.0 ) THEN + MINSIZE = 1 + ELSE IF( NBMAX.GE.N ) THEN MINSIZE = N ELSE MINSIZE = (N+NBMAX+1)*(NBMAX+3) @@ -173,28 +178,29 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF (LWORK .LT. MINSIZE .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.MINSIZE .AND. .NOT.LQUERY ) THEN INFO = -7 END IF -* -* Quick return if possible -* * IF( INFO.NE.0 ) THEN CALL XERBLA( 'CHETRI2', -INFO ) RETURN ELSE IF( LQUERY ) THEN - WORK(1)=MINSIZE + WORK( 1 ) = SROUNDUP_LWORK( MINSIZE ) RETURN END IF +* +* Quick return if possible +* IF( N.EQ.0 ) $ RETURN - IF( NBMAX .GE. 
N ) THEN + IF( NBMAX.GE.N ) THEN CALL CHETRI( UPLO, N, A, LDA, IPIV, WORK, INFO ) ELSE CALL CHETRI2X( UPLO, N, A, LDA, IPIV, WORK, NBMAX, INFO ) END IF +* RETURN * * End of CHETRI2 diff --git a/lapack-netlib/SRC/chetri_3.f b/lapack-netlib/SRC/chetri_3.f index deda63598..ccfce5070 100644 --- a/lapack-netlib/SRC/chetri_3.f +++ b/lapack-netlib/SRC/chetri_3.f @@ -119,16 +119,17 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX array, dimension (N+NB+1)*(NB+3). +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)). *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >= (N+NB+1)*(NB+3). +*> The length of WORK. +*> If N = 0, LWORK >= 1, else LWORK >= (N+NB+1)*(NB+3). *> -*> If LDWORK = -1, then a workspace query is assumed; +*> If LWORK = -1, then a workspace query is assumed; *> the routine only calculates the optimal size of the optimal *> size of the WORK array, returns this value as the first *> entry of the WORK array, and no error message related to @@ -209,8 +210,13 @@ * * Determine the block size * - NB = MAX( 1, ILAENV( 1, 'CHETRI_3', UPLO, N, -1, -1, -1 ) ) - LWKOPT = ( N+NB+1 ) * ( NB+3 ) + IF( N.EQ.0 ) THEN + LWKOPT = 1 + ELSE + NB = MAX( 1, ILAENV( 1, 'CHETRI_3', UPLO, N, -1, -1, -1 ) ) + LWKOPT = ( N+NB+1 ) * ( NB+3 ) + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 @@ -218,7 +224,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF ( LWORK .LT. LWKOPT .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKOPT .AND. .NOT.LQUERY ) THEN INFO = -8 END IF * @@ -226,7 +232,6 @@ CALL XERBLA( 'CHETRI_3', -INFO ) RETURN ELSE IF( LQUERY ) THEN - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN END IF * @@ -237,7 +242,7 @@ * CALL CHETRI_3X( UPLO, N, A, LDA, E, IPIV, WORK, NB, INFO ) * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/chetrs_aa.f b/lapack-netlib/SRC/chetrs_aa.f index 879549106..07179ab92 100644 --- a/lapack-netlib/SRC/chetrs_aa.f +++ b/lapack-netlib/SRC/chetrs_aa.f @@ -105,7 +105,13 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,3*N-2). +*> The dimension of the array WORK. +*> If MIN(N,NRHS) = 0, LWORK >= 1, else LWORK >= 3*N-2. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the minimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. *> \endverbatim *> *> \param[out] INFO @@ -151,24 +157,30 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, UPPER - INTEGER K, KP, LWKOPT + INTEGER K, KP, LWKMIN * .. * .. External Functions .. LOGICAL LSAME REAL SROUNDUP_LWORK - EXTERNAL LSAME,SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CLACPY, CLACGV, CGTSV, CSWAP, CTRSM, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC MAX + INTRINSIC MIN, MAX * .. * .. Executable Statements .. * INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) + IF( MIN( N, NRHS ).EQ.0 ) THEN + LWKMIN = 1 + ELSE + LWKMIN = 3*N-2 + END IF +* IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -179,21 +191,20 @@ INFO = -5 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -8 - ELSE IF( LWORK.LT.MAX( 1, 3*N-2 ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. 
.NOT.LQUERY ) THEN INFO = -10 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'CHETRS_AA', -INFO ) RETURN ELSE IF( LQUERY ) THEN - LWKOPT = (3*N-2) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKMIN ) RETURN END IF * * Quick return if possible * - IF( N.EQ.0 .OR. NRHS.EQ.0 ) + IF( MIN( N, NRHS ).EQ.0 ) $ RETURN * IF( UPPER ) THEN diff --git a/lapack-netlib/SRC/clamswlq.f b/lapack-netlib/SRC/clamswlq.f index 5daf60bf6..8f474a3ab 100644 --- a/lapack-netlib/SRC/clamswlq.f +++ b/lapack-netlib/SRC/clamswlq.f @@ -127,17 +127,20 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*> If SIDE = 'L', LWORK >= max(1,NB) * MB; -*> if SIDE = 'R', LWORK >= max(1,M) * MB. +*> If MIN(M,N,K) = 0, LWORK >= 1. +*> If SIDE = 'L', LWORK >= max(1,NB*MB). +*> If SIDE = 'R', LWORK >= max(1,M*MB). +*> *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> \endverbatim @@ -193,91 +196,100 @@ *> * ===================================================================== SUBROUTINE CLAMSWLQ( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T, - $ LDT, C, LDC, WORK, LWORK, INFO ) + $ LDT, C, LDC, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * * .. Scalar Arguments .. - CHARACTER SIDE, TRANS - INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC + CHARACTER SIDE, TRANS + INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC * .. * .. Array Arguments .. - COMPLEX A( LDA, * ), WORK( * ), C(LDC, * ), - $ T( LDT, * ) + COMPLEX A( LDA, * ), WORK( * ), C( LDC, * ), + $ T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER I, II, KK, LW, CTR + LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY + INTEGER I, II, KK, LW, CTR, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME REAL SROUNDUP_LWORK EXTERNAL LSAME, SROUNDUP_LWORK +* .. * .. External Subroutines .. - EXTERNAL CTPMLQT, CGEMLQT, XERBLA + EXTERNAL CTPMLQT, CGEMLQT, XERBLA * .. * .. Executable Statements .. * * Test the input arguments * - LQUERY = LWORK.LT.0 + INFO = 0 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'C' ) LEFT = LSAME( SIDE, 'L' ) RIGHT = LSAME( SIDE, 'R' ) - IF (LEFT) THEN + IF( LEFT ) THEN LW = N * MB ELSE LW = M * MB END IF * - INFO = 0 + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF +* IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN - INFO = -1 + INFO = -1 ELSE IF( .NOT.TRAN .AND. .NOT.NOTRAN ) THEN - INFO = -2 + INFO = -2 ELSE IF( K.LT.0 ) THEN INFO = -5 ELSE IF( M.LT.K ) THEN INFO = -3 ELSE IF( N.LT.0 ) THEN INFO = -4 - ELSE IF( K.LT.MB .OR. MB.LT.1) THEN + ELSE IF( K.LT.MB .OR. 
MB.LT.1 ) THEN INFO = -6 ELSE IF( LDA.LT.MAX( 1, K ) ) THEN INFO = -9 - ELSE IF( LDT.LT.MAX( 1, MB) ) THEN + ELSE IF( LDT.LT.MAX( 1, MB ) ) THEN INFO = -11 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN - INFO = -13 - ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN + INFO = -13 + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -15 END IF * + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) + END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'CLAMSWLQ', -INFO ) - WORK(1) = SROUNDUP_LWORK(LW) RETURN - ELSE IF (LQUERY) THEN - WORK(1) = SROUNDUP_LWORK(LW) + ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * - IF( MIN(M,N,K).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * IF((NB.LE.K).OR.(NB.GE.MAX(M,N,K))) THEN CALL CGEMLQT( SIDE, TRANS, M, N, K, MB, A, LDA, - $ T, LDT, C, LDC, WORK, INFO) + $ T, LDT, C, LDC, WORK, INFO ) RETURN END IF * @@ -404,7 +416,7 @@ * END IF * - WORK(1) = SROUNDUP_LWORK(LW) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of CLAMSWLQ diff --git a/lapack-netlib/SRC/clamtsqr.f b/lapack-netlib/SRC/clamtsqr.f index 05021e642..13625087f 100644 --- a/lapack-netlib/SRC/clamtsqr.f +++ b/lapack-netlib/SRC/clamtsqr.f @@ -128,22 +128,24 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) -*> +*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim +*> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. +*> If MIN(M,N,K) = 0, LWORK >= 1. +*> If SIDE = 'L', LWORK >= max(1,N*NB). +*> If SIDE = 'R', LWORK >= max(1,MB*NB). *> -*> If SIDE = 'L', LWORK >= max(1,N)*NB; -*> if SIDE = 'R', LWORK >= max(1,MB)*NB. *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. -*> *> \endverbatim +*> *> \param[out] INFO *> \verbatim *> INFO is INTEGER @@ -195,45 +197,47 @@ *> * ===================================================================== SUBROUTINE CLAMTSQR( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T, - $ LDT, C, LDC, WORK, LWORK, INFO ) + $ LDT, C, LDC, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * * .. Scalar Arguments .. - CHARACTER SIDE, TRANS - INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC + CHARACTER SIDE, TRANS + INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC * .. * .. Array Arguments .. - COMPLEX A( LDA, * ), WORK( * ), C(LDC, * ), - $ T( LDT, * ) + COMPLEX A( LDA, * ), WORK( * ), C( LDC, * ), + $ T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER I, II, KK, LW, CTR, Q + LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY + INTEGER I, II, KK, LW, CTR, Q, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME REAL SROUNDUP_LWORK EXTERNAL LSAME, SROUNDUP_LWORK +* .. * .. External Subroutines .. - EXTERNAL CGEMQRT, CTPMQRT, XERBLA + EXTERNAL CGEMQRT, CTPMQRT, XERBLA * .. * .. Executable Statements .. 
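The CLAMSWLQ and CLAMTSQR hunks (and the CLASWLQ/CLATSQR ones further down) all converge on the same shape: LWORK = -1 is the query flag, the minimum drops to 1 for empty problems, LWORK is validated against that minimum, and the value is published through WORK(1) before any early return. Condensed into one fragment, using the names from the routines above (a restatement of the pattern, not additional code from the patch):

      MINMNK = MIN( M, N, K )
      IF( MINMNK.EQ.0 ) THEN
         LWMIN = 1
      ELSE
         LWMIN = MAX( 1, LW )
      END IF
      IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) INFO = -15
      IF( INFO.EQ.0 ) WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
      ! ...followed by XERBLA on a negative INFO, RETURN on a query,
      ! and a quick return once MINMNK is known to be zero.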
* * Test the input arguments * - LQUERY = LWORK.LT.0 + INFO = 0 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'C' ) LEFT = LSAME( SIDE, 'L' ) RIGHT = LSAME( SIDE, 'R' ) - IF (LEFT) THEN + IF( LEFT ) THEN LW = N * NB Q = M ELSE @@ -241,11 +245,17 @@ Q = N END IF * - INFO = 0 + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF +* IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN - INFO = -1 + INFO = -1 ELSE IF( .NOT.TRAN .AND. .NOT.NOTRAN ) THEN - INFO = -2 + INFO = -2 ELSE IF( M.LT.K ) THEN INFO = -3 ELSE IF( N.LT.0 ) THEN @@ -256,38 +266,38 @@ INFO = -7 ELSE IF( LDA.LT.MAX( 1, Q ) ) THEN INFO = -9 - ELSE IF( LDT.LT.MAX( 1, NB) ) THEN + ELSE IF( LDT.LT.MAX( 1, NB ) ) THEN INFO = -11 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN - INFO = -13 - ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN + INFO = -13 + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -15 END IF * -* Determine the block size if it is tall skinny or short and wide -* - IF( INFO.EQ.0) THEN - WORK(1) = SROUNDUP_LWORK(LW) + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN CALL XERBLA( 'CLAMTSQR', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N,K).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF +* +* Determine the block size if it is tall skinny or short and wide * IF((MB.LE.K).OR.(MB.GE.MAX(M,N,K))) THEN CALL CGEMQRT( SIDE, TRANS, M, N, K, NB, A, LDA, - $ T, LDT, C, LDC, WORK, INFO) + $ T, LDT, C, LDC, WORK, INFO ) RETURN - END IF + END IF * IF(LEFT.AND.NOTRAN) THEN * @@ -412,7 +422,7 @@ * END IF * - WORK(1) = SROUNDUP_LWORK(LW) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of CLAMTSQR diff --git a/lapack-netlib/SRC/clarfgp.f b/lapack-netlib/SRC/clarfgp.f index 47b5e47b0..980e93612 100644 --- a/lapack-netlib/SRC/clarfgp.f +++ b/lapack-netlib/SRC/clarfgp.f @@ -148,33 +148,23 @@ ALPHR = REAL( ALPHA ) ALPHI = AIMAG( ALPHA ) * - IF( XNORM.LE.EPS*ABS(ALPHA) ) THEN + IF( XNORM.LE.EPS*ABS(ALPHA) .AND. ALPHI.EQ.ZERO ) THEN * * H = [1-alpha/abs(alpha) 0; 0 I], sign chosen so ALPHA >= 0. * - IF( ALPHI.EQ.ZERO ) THEN - IF( ALPHR.GE.ZERO ) THEN -* When TAU.eq.ZERO, the vector is special-cased to be -* all zeros in the application routines. We do not need -* to clear it. - TAU = ZERO - ELSE -* However, the application routines rely on explicit -* zero checks when TAU.ne.ZERO, and we must clear X. - TAU = TWO - DO J = 1, N-1 - X( 1 + (J-1)*INCX ) = ZERO - END DO - ALPHA = -ALPHA - END IF + IF( ALPHR.GE.ZERO ) THEN +* When TAU.eq.ZERO, the vector is special-cased to be +* all zeros in the application routines. We do not need +* to clear it. + TAU = ZERO ELSE -* Only "reflecting" the diagonal entry to be real and non-negative. - XNORM = SLAPY2( ALPHR, ALPHI ) - TAU = CMPLX( ONE - ALPHR / XNORM, -ALPHI / XNORM ) +* However, the application routines rely on explicit +* zero checks when TAU.ne.ZERO, and we must clear X. + TAU = TWO DO J = 1, N-1 X( 1 + (J-1)*INCX ) = ZERO END DO - ALPHA = XNORM + ALPHA = -ALPHA END IF ELSE * diff --git a/lapack-netlib/SRC/claswlq.f b/lapack-netlib/SRC/claswlq.f index 12e8373df..2044e055c 100644 --- a/lapack-netlib/SRC/claswlq.f +++ b/lapack-netlib/SRC/claswlq.f @@ -96,22 +96,24 @@ *> The leading dimension of the array T. LDT >= MB. 
*> \endverbatim *> -*> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) -*> +*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim +*> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= MB*M. +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MB*M, otherwise. +*> *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. -*> *> \endverbatim +*> *> \param[out] INFO *> \verbatim *> INFO is INTEGER @@ -163,33 +165,35 @@ *> * ===================================================================== SUBROUTINE CLASWLQ( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK, - $ INFO) + $ INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- * * .. Scalar Arguments .. - INTEGER INFO, LDA, M, N, MB, NB, LWORK, LDT + INTEGER INFO, LDA, M, N, MB, NB, LWORK, LDT * .. * .. Array Arguments .. - COMPLEX A( LDA, * ), WORK( * ), T( LDT, *) + COMPLEX A( LDA, * ), WORK( * ), T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LQUERY - INTEGER I, II, KK, CTR + LOGICAL LQUERY + INTEGER I, II, KK, CTR, MINMN, LWMIN * .. * .. EXTERNAL FUNCTIONS .. LOGICAL LSAME INTEGER ILAENV REAL SROUNDUP_LWORK EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK +* .. * .. EXTERNAL SUBROUTINES .. EXTERNAL CGELQT, CTPLQT, XERBLA +* .. * .. INTRINSIC FUNCTIONS .. INTRINSIC MAX, MIN, MOD * .. @@ -200,12 +204,19 @@ INFO = 0 * LQUERY = ( LWORK.EQ.-1 ) +* + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = M*MB + END IF * IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 .OR. N.LT.M ) THEN INFO = -2 - ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 )) THEN + ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 ) ) THEN INFO = -3 ELSE IF( NB.LE.0 ) THEN INFO = -4 @@ -213,60 +224,61 @@ INFO = -6 ELSE IF( LDT.LT.MB ) THEN INFO = -8 - ELSE IF( ( LWORK.LT.M*MB) .AND. (.NOT.LQUERY) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -10 END IF - IF( INFO.EQ.0) THEN - WORK(1) = SROUNDUP_LWORK(MB*M) +* + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN CALL XERBLA( 'CLASWLQ', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N).EQ.0 ) THEN - RETURN + IF( MINMN.EQ.0 ) THEN + RETURN END IF * * The LQ Decomposition * - IF((M.GE.N).OR.(NB.LE.M).OR.(NB.GE.N)) THEN + IF( (M.GE.N) .OR. (NB.LE.M) .OR. 
(NB.GE.N) ) THEN CALL CGELQT( M, N, MB, A, LDA, T, LDT, WORK, INFO) RETURN - END IF + END IF * - KK = MOD((N-M),(NB-M)) - II=N-KK+1 + KK = MOD((N-M),(NB-M)) + II = N-KK+1 * -* Compute the LQ factorization of the first block A(1:M,1:NB) +* Compute the LQ factorization of the first block A(1:M,1:NB) * - CALL CGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO) - CTR = 1 + CALL CGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO) + CTR = 1 * - DO I = NB+1, II-NB+M , (NB-M) + DO I = NB+1, II-NB+M , (NB-M) * -* Compute the QR factorization of the current block A(1:M,I:I+NB-M) +* Compute the QR factorization of the current block A(1:M,I:I+NB-M) * - CALL CTPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ), + CALL CTPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ), $ LDA, T(1,CTR*M+1), $ LDT, WORK, INFO ) - CTR = CTR + 1 - END DO + CTR = CTR + 1 + END DO * * Compute the QR factorization of the last block A(1:M,II:N) * - IF (II.LE.N) THEN + IF( II.LE.N ) THEN CALL CTPLQT( M, KK, 0, MB, A(1,1), LDA, A( 1, II ), $ LDA, T(1,CTR*M+1), LDT, $ WORK, INFO ) - END IF + END IF * - WORK( 1 ) = SROUNDUP_LWORK(M * MB) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of CLASWLQ diff --git a/lapack-netlib/SRC/clatrs3.f b/lapack-netlib/SRC/clatrs3.f index 0502f6898..354141a8b 100644 --- a/lapack-netlib/SRC/clatrs3.f +++ b/lapack-netlib/SRC/clatrs3.f @@ -152,13 +152,17 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL array, dimension (LWORK). +*> WORK is REAL array, dimension (MAX(1,LWORK)). *> On exit, if INFO = 0, WORK(1) returns the optimal size of *> WORK. *> \endverbatim *> *> \param[in] LWORK +*> \verbatim *> LWORK is INTEGER +*> The dimension of the array WORK. +*> +*> If MIN(N,NRHS) = 0, LWORK >= 1, else *> LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where *> NBA = (N + NB - 1)/NB and NB is the optimal block size. *> @@ -166,6 +170,7 @@ *> only calculates the optimal dimensions of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. +*> \endverbatim *> *> \param[out] INFO *> \verbatim @@ -182,7 +187,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleOTHERauxiliary +*> \ingroup latrs3 *> \par Further Details: * ===================== * \verbatim @@ -257,15 +262,16 @@ LOGICAL LQUERY, NOTRAN, NOUNIT, UPPER INTEGER AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J, $ JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2, - $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS + $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS, LWMIN REAL ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC, $ SCAMIN, SMLNUM, TMAX * .. * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL SLAMCH, CLANGE, SLARMM - EXTERNAL ILAENV, LSAME, SLAMCH, CLANGE, SLARMM + REAL SLAMCH, CLANGE, SLARMM, SROUNDUP_LWORK + EXTERNAL ILAENV, LSAME, SLAMCH, CLANGE, SLARMM, + $ SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CLATRS, CSSCAL, XERBLA @@ -296,15 +302,24 @@ * row. WORK( I + KK * LDS ) is the scale factor of the vector * segment associated with the I-th block row and the KK-th vector * in the block column. +* LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) ) LDS = NBA +* * The second part stores upper bounds of the triangular A. There are * a total of NBA x NBA blocks, of which only the upper triangular * part or the lower triangular part is referenced. The upper bound of * the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). 
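A concrete reading of this split, with illustrative sizes and the internal right-hand-side block NBRHS taken as the 32 quoted in the LWORK description: for N = 500, NRHS = 8 and block size NB = 100,

      NBA    = (500 + 100 - 1)/100             = 5
      LSCALE = NBA*MAX( NBA, MIN( NRHS, 32 ) ) = 5*8 = 40
      LANRM  = NBA*NBA                         = 25
      LWMIN  = LSCALE + LANRM                  = 65

so the first 40 entries of WORK hold the per-block scale factors and the next 25 hold the norm bounds of the referenced triangular blocks, which is the LWMIN the new code checks LWORK against.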
+* LANRM = NBA * NBA AWRK = LSCALE - WORK( 1 ) = LSCALE + LANRM +* + IF( MIN( N, NRHS ).EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = LSCALE + LANRM + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) * * Test the input parameters. * @@ -326,7 +341,7 @@ INFO = -8 ELSE IF( LDX.LT.MAX( 1, N ) ) THEN INFO = -10 - ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN + ELSE IF( .NOT.LQUERY .AND. LWORK.LT.LWMIN ) THEN INFO = -14 END IF IF( INFO.NE.0 ) THEN @@ -659,6 +674,9 @@ END IF END DO END DO +* + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) +* RETURN * * End of CLATRS3 diff --git a/lapack-netlib/SRC/clatsqr.f b/lapack-netlib/SRC/clatsqr.f index cd2cb4aa7..67403693f 100644 --- a/lapack-netlib/SRC/clatsqr.f +++ b/lapack-netlib/SRC/clatsqr.f @@ -101,15 +101,18 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= NB*N. +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= NB*N, otherwise. +*> *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> \endverbatim @@ -165,32 +168,34 @@ *> * ===================================================================== SUBROUTINE CLATSQR( M, N, MB, NB, A, LDA, T, LDT, WORK, - $ LWORK, INFO) + $ LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- * * .. Scalar Arguments .. - INTEGER INFO, LDA, M, N, MB, NB, LDT, LWORK + INTEGER INFO, LDA, M, N, MB, NB, LDT, LWORK * .. * .. Array Arguments .. - COMPLEX A( LDA, * ), WORK( * ), T(LDT, *) + COMPLEX A( LDA, * ), WORK( * ), T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LQUERY - INTEGER I, II, KK, CTR + LOGICAL LQUERY + INTEGER I, II, KK, CTR, LWMIN, MINMN * .. * .. EXTERNAL FUNCTIONS .. LOGICAL LSAME REAL SROUNDUP_LWORK EXTERNAL LSAME, SROUNDUP_LWORK +* .. * .. EXTERNAL SUBROUTINES .. - EXTERNAL CGEQRT, CTPQRT, XERBLA + EXTERNAL CGEQRT, CTPQRT, XERBLA +* .. * .. INTRINSIC FUNCTIONS .. INTRINSIC MAX, MIN, MOD * .. @@ -201,6 +206,13 @@ INFO = 0 * LQUERY = ( LWORK.EQ.-1 ) +* + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = N*NB + END IF * IF( M.LT.0 ) THEN INFO = -1 @@ -208,64 +220,65 @@ INFO = -2 ELSE IF( MB.LT.1 ) THEN INFO = -3 - ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. N.GT.0 )) THEN + ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. N.GT.0 ) ) THEN INFO = -4 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -6 ELSE IF( LDT.LT.NB ) THEN INFO = -8 - ELSE IF( LWORK.LT.(N*NB) .AND. (.NOT.LQUERY) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. 
(.NOT.LQUERY) ) THEN INFO = -10 END IF - IF( INFO.EQ.0) THEN - WORK(1) = SROUNDUP_LWORK(NB*N) +* + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'CLATSQR', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N).EQ.0 ) THEN - RETURN + IF( MINMN.EQ.0 ) THEN + RETURN END IF * * The QR Decomposition * - IF ((MB.LE.N).OR.(MB.GE.M)) THEN - CALL CGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO) - RETURN - END IF - KK = MOD((M-N),(MB-N)) - II=M-KK+1 + IF ( (MB.LE.N) .OR. (MB.GE.M) ) THEN + CALL CGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO ) + RETURN + END IF + KK = MOD((M-N),(MB-N)) + II = M-KK+1 * -* Compute the QR factorization of the first block A(1:MB,1:N) +* Compute the QR factorization of the first block A(1:MB,1:N) * - CALL CGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO ) - CTR = 1 + CALL CGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO ) + CTR = 1 * - DO I = MB+1, II-MB+N , (MB-N) + DO I = MB+1, II-MB+N, (MB-N) * -* Compute the QR factorization of the current block A(I:I+MB-N,1:N) +* Compute the QR factorization of the current block A(I:I+MB-N,1:N) * - CALL CTPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA, + CALL CTPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA, $ T(1,CTR * N + 1), - $ LDT, WORK, INFO ) - CTR = CTR + 1 - END DO + $ LDT, WORK, INFO ) + CTR = CTR + 1 + END DO * -* Compute the QR factorization of the last block A(II:M,1:N) +* Compute the QR factorization of the last block A(II:M,1:N) * - IF (II.LE.M) THEN - CALL CTPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA, + IF( II.LE.M ) THEN + CALL CTPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA, $ T(1, CTR * N + 1), LDT, - $ WORK, INFO ) - END IF + $ WORK, INFO ) + END IF * - WORK( 1 ) = SROUNDUP_LWORK(N*NB) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of CLATSQR diff --git a/lapack-netlib/SRC/dbbcsd.f b/lapack-netlib/SRC/dbbcsd.f index 4fcf9ab5d..913f96a73 100644 --- a/lapack-netlib/SRC/dbbcsd.f +++ b/lapack-netlib/SRC/dbbcsd.f @@ -805,7 +805,7 @@ CALL DLARTGP( B22BULGE, B22E(I-1), WORK(IU2SN+I-1), $ WORK(IU2CS+I-1), R ) ELSE IF( NU .LT. MU ) THEN - CALL DLARTGS( B21E(I), B21E(I+1), NU, WORK(IU2CS+I-1), + CALL DLARTGS( B21E(I), B21D(I+1), NU, WORK(IU2CS+I-1), $ WORK(IU2SN+I-1) ) ELSE CALL DLARTGS( B22D(I), B22E(I), MU, WORK(IU2CS+I-1), diff --git a/lapack-netlib/SRC/dgebrd.f b/lapack-netlib/SRC/dgebrd.f index 0f0d1651a..ac11d48a0 100644 --- a/lapack-netlib/SRC/dgebrd.f +++ b/lapack-netlib/SRC/dgebrd.f @@ -122,7 +122,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of the array WORK. LWORK >= max(1,M,N). +*> The length of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MAX(M,N), otherwise. *> For optimum performance LWORK >= (M+N)*NB, where NB *> is the optimal blocksize. *> @@ -147,7 +148,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEcomputational +*> \ingroup gebrd * *> \par Further Details: * ===================== @@ -223,8 +224,8 @@ * .. * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IINFO, J, LDWRKX, LDWRKY, LWKOPT, MINMN, NB, - $ NBMIN, NX, WS + INTEGER I, IINFO, J, LDWRKX, LDWRKY, LWKMIN, LWKOPT, + $ MINMN, NB, NBMIN, NX, WS * .. * .. External Subroutines .. 
EXTERNAL DGEBD2, DGEMM, DLABRD, XERBLA @@ -241,9 +242,17 @@ * Test the input parameters * INFO = 0 - NB = MAX( 1, ILAENV( 1, 'DGEBRD', ' ', M, N, -1, -1 ) ) - LWKOPT = ( M+N )*NB + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = MAX( M, N ) + NB = MAX( 1, ILAENV( 1, 'DGEBRD', ' ', M, N, -1, -1 ) ) + LWKOPT = ( M+N )*NB + ENDIF WORK( 1 ) = DBLE( LWKOPT ) +* LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -251,7 +260,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, M, N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF IF( INFO.LT.0 ) THEN @@ -263,7 +272,6 @@ * * Quick return if possible * - MINMN = MIN( M, N ) IF( MINMN.EQ.0 ) THEN WORK( 1 ) = 1 RETURN @@ -282,7 +290,7 @@ * Determine when to switch from blocked to unblocked code. * IF( NX.LT.MINMN ) THEN - WS = ( M+N )*NB + WS = LWKOPT IF( LWORK.LT.WS ) THEN * * Not enough work space for the optimal NB, consider using diff --git a/lapack-netlib/SRC/dgedmd.f90 b/lapack-netlib/SRC/dgedmd.f90 index 20424808f..15df48fe9 100644 --- a/lapack-netlib/SRC/dgedmd.f90 +++ b/lapack-netlib/SRC/dgedmd.f90 @@ -1,424 +1,574 @@ - SUBROUTINE DGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & - M, N, X, LDX, Y, LDY, NRNK, TOL, & - K, REIG, IMEIG, Z, LDZ, RES, & - B, LDB, W, LDW, S, LDS, & - WORK, LWORK, IWORK, LIWORK, INFO ) -! March 2023 +!> \brief \b DGEDMD computes the Dynamic Mode Decomposition (DMD) for a pair of data snapshot matrices. +! +! =========== DOCUMENTATION =========== +! +! Definition: +! =========== +! +! SUBROUTINE DGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & +! M, N, X, LDX, Y, LDY, NRNK, TOL, & +! K, REIG, IMEIG, Z, LDZ, RES, & +! B, LDB, W, LDW, S, LDS, & +! WORK, LWORK, IWORK, LIWORK, INFO ) +! !..... - USE iso_fortran_env - IMPLICIT NONE - INTEGER, PARAMETER :: WP = real64 +! USE iso_fortran_env +! IMPLICIT NONE +! INTEGER, PARAMETER :: WP = real64 !..... ! Scalar arguments - CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF - INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & - NRNK, LDZ, LDB, LDW, LDS, & - LWORK, LIWORK - INTEGER, INTENT(OUT) :: K, INFO - REAL(KIND=WP), INTENT(IN) :: TOL +! CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF +! INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & +! NRNK, LDZ, LDB, LDW, LDS, & +! LWORK, LIWORK +! INTEGER, INTENT(OUT) :: K, INFO +! REAL(KIND=WP), INTENT(IN) :: TOL ! Array arguments - REAL(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) - REAL(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & - W(LDW,*), S(LDS,*) - REAL(KIND=WP), INTENT(OUT) :: REIG(*), IMEIG(*), & - RES(*) - REAL(KIND=WP), INTENT(OUT) :: WORK(*) - INTEGER, INTENT(OUT) :: IWORK(*) -!............................................................ -! Purpose -! ======= -! DGEDMD computes the Dynamic Mode Decomposition (DMD) for -! a pair of data snapshot matrices. For the input matrices -! X and Y such that Y = A*X with an unaccessible matrix -! A, DGEDMD computes a certain number of Ritz pairs of A using -! the standard Rayleigh-Ritz extraction from a subspace of -! range(X) that is determined using the leading left singular -! vectors of X. Optionally, DGEDMD returns the residuals -! of the computed Ritz pairs, the information needed for -! a refinement of the Ritz vectors, or the eigenvectors of -! the Exact DMD. -! For further details see the references listed -! below. For more details of the implementation see [3]. -! -! References -! ========== -! [1] P. Schmid: Dynamic mode decomposition of numerical -! 
and experimental data, -! Journal of Fluid Mechanics 656, 5-28, 2010. -! [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal -! decompositions: analysis and enhancements, -! SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. -! [3] Z. Drmac: A LAPACK implementation of the Dynamic -! Mode Decomposition I. Technical report. AIMDyn Inc. -! and LAPACK Working Note 298. -! [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. -! Brunton, N. Kutz: On Dynamic Mode Decomposition: -! Theory and Applications, Journal of Computational -! Dynamics 1(2), 391 -421, 2014. +! REAL(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) +! REAL(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & +! W(LDW,*), S(LDS,*) +! REAL(KIND=WP), INTENT(OUT) :: REIG(*), IMEIG(*), & +! RES(*) +! REAL(KIND=WP), INTENT(OUT) :: WORK(*) +! INTEGER, INTENT(OUT) :: IWORK(*) ! -!...................................................................... -! Developed and supported by: -! =========================== -! Developed and coded by Zlatko Drmac, Faculty of Science, -! University of Zagreb; drmac@math.hr -! In cooperation with -! AIMdyn Inc., Santa Barbara, CA. -! and supported by -! - DARPA SBIR project "Koopman Operator-Based Forecasting -! for Nonstationary Processes from Near-Term, Limited -! Observational Data" Contract No: W31P4Q-21-C-0007 -! - DARPA PAI project "Physics-Informed Machine Learning -! Methodologies" Contract No: HR0011-18-9-0033 -! - DARPA MoDyL project "A Data-Driven, Operator-Theoretic -! Framework for Space-Time Analysis of Process Dynamics" -! Contract No: HR0011-16-C-0116 -! Any opinions, findings and conclusions or recommendations -! expressed in this material are those of the author and -! do not necessarily reflect the views of the DARPA SBIR -! Program Office -!============================================================ -! Distribution Statement A: -! Approved for Public Release, Distribution Unlimited. -! Cleared by DARPA on September 29, 2022 -!============================================================ !............................................................ +!> \par Purpose: +! ============= +!> \verbatim +!> DGEDMD computes the Dynamic Mode Decomposition (DMD) for +!> a pair of data snapshot matrices. For the input matrices +!> X and Y such that Y = A*X with an unaccessible matrix +!> A, DGEDMD computes a certain number of Ritz pairs of A using +!> the standard Rayleigh-Ritz extraction from a subspace of +!> range(X) that is determined using the leading left singular +!> vectors of X. Optionally, DGEDMD returns the residuals +!> of the computed Ritz pairs, the information needed for +!> a refinement of the Ritz vectors, or the eigenvectors of +!> the Exact DMD. +!> For further details see the references listed +!> below. For more details of the implementation see [3]. +!> \endverbatim +!............................................................ +!> \par References: +! ================ +!> \verbatim +!> [1] P. Schmid: Dynamic mode decomposition of numerical +!> and experimental data, +!> Journal of Fluid Mechanics 656, 5-28, 2010. +!> [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal +!> decompositions: analysis and enhancements, +!> SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. +!> [3] Z. Drmac: A LAPACK implementation of the Dynamic +!> Mode Decomposition I. Technical report. AIMDyn Inc. +!> and LAPACK Working Note 298. +!> [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. +!> Brunton, N. 
Kutz: On Dynamic Mode Decomposition: +!> Theory and Applications, Journal of Computational +!> Dynamics 1(2), 391 -421, 2014. +!> \endverbatim +!...................................................................... +!> \par Developed and supported by: +! ================================ +!> \verbatim +!> Developed and coded by Zlatko Drmac, Faculty of Science, +!> University of Zagreb; drmac@math.hr +!> In cooperation with +!> AIMdyn Inc., Santa Barbara, CA. +!> and supported by +!> - DARPA SBIR project "Koopman Operator-Based Forecasting +!> for Nonstationary Processes from Near-Term, Limited +!> Observational Data" Contract No: W31P4Q-21-C-0007 +!> - DARPA PAI project "Physics-Informed Machine Learning +!> Methodologies" Contract No: HR0011-18-9-0033 +!> - DARPA MoDyL project "A Data-Driven, Operator-Theoretic +!> Framework for Space-Time Analysis of Process Dynamics" +!> Contract No: HR0011-16-C-0116 +!> Any opinions, findings and conclusions or recommendations +!> expressed in this material are those of the author and +!> do not necessarily reflect the views of the DARPA SBIR +!> Program Office +!> \endverbatim +!...................................................................... +!> \par Distribution Statement A: +! ============================== +!> \verbatim +!> Approved for Public Release, Distribution Unlimited. +!> Cleared by DARPA on September 29, 2022 +!> \endverbatim +!...................................................................... ! Arguments ! ========= -! JOBS (input) CHARACTER*1 -! Determines whether the initial data snapshots are scaled -! by a diagonal matrix. -! 'S' :: The data snapshots matrices X and Y are multiplied -! with a diagonal matrix D so that X*D has unit -! nonzero columns (in the Euclidean 2-norm) -! 'C' :: The snapshots are scaled as with the 'S' option. -! If it is found that an i-th column of X is zero -! vector and the corresponding i-th column of Y is -! non-zero, then the i-th column of Y is set to -! zero and a warning flag is raised. -! 'Y' :: The data snapshots matrices X and Y are multiplied -! by a diagonal matrix D so that Y*D has unit -! nonzero columns (in the Euclidean 2-norm) -! 'N' :: No data scaling. +! +!> \param[in] JOBS +!> \verbatim +!> JOBS (input) is CHARACTER*1 +!> Determines whether the initial data snapshots are scaled +!> by a diagonal matrix. +!> 'S' :: The data snapshots matrices X and Y are multiplied +!> with a diagonal matrix D so that X*D has unit +!> nonzero columns (in the Euclidean 2-norm) +!> 'C' :: The snapshots are scaled as with the 'S' option. +!> If it is found that an i-th column of X is zero +!> vector and the corresponding i-th column of Y is +!> non-zero, then the i-th column of Y is set to +!> zero and a warning flag is raised. +!> 'Y' :: The data snapshots matrices X and Y are multiplied +!> by a diagonal matrix D so that Y*D has unit +!> nonzero columns (in the Euclidean 2-norm) +!> 'N' :: No data scaling. +!> \endverbatim !..... -! JOBZ (input) CHARACTER*1 -! Determines whether the eigenvectors (Koopman modes) will -! be computed. -! 'V' :: The eigenvectors (Koopman modes) will be computed -! and returned in the matrix Z. -! See the description of Z. -! 'F' :: The eigenvectors (Koopman modes) will be returned -! in factored form as the product X(:,1:K)*W, where X -! contains a POD basis (leading left singular vectors -! of the data matrix X) and W contains the eigenvectors -! of the corresponding Rayleigh quotient. -! See the descriptions of K, X, W, Z. -! 
'N' :: The eigenvectors are not computed. +!> \param[in] JOBZ +!> \verbatim +!> JOBZ (input) CHARACTER*1 +!> Determines whether the eigenvectors (Koopman modes) will +!> be computed. +!> 'V' :: The eigenvectors (Koopman modes) will be computed +!> and returned in the matrix Z. +!> See the description of Z. +!> 'F' :: The eigenvectors (Koopman modes) will be returned +!> in factored form as the product X(:,1:K)*W, where X +!> contains a POD basis (leading left singular vectors +!> of the data matrix X) and W contains the eigenvectors +!> of the corresponding Rayleigh quotient. +!> See the descriptions of K, X, W, Z. +!> 'N' :: The eigenvectors are not computed. +!> \endverbatim !..... -! JOBR (input) CHARACTER*1 -! Determines whether to compute the residuals. -! 'R' :: The residuals for the computed eigenpairs will be -! computed and stored in the array RES. -! See the description of RES. -! For this option to be legal, JOBZ must be 'V'. -! 'N' :: The residuals are not computed. +!> \param[in] JOBR +!> \verbatim +!> JOBR (input) CHARACTER*1 +!> Determines whether to compute the residuals. +!> 'R' :: The residuals for the computed eigenpairs will be +!> computed and stored in the array RES. +!> See the description of RES. +!> For this option to be legal, JOBZ must be 'V'. +!> 'N' :: The residuals are not computed. +!> \endverbatim !..... -! JOBF (input) CHARACTER*1 -! Specifies whether to store information needed for post- -! processing (e.g. computing refined Ritz vectors) -! 'R' :: The matrix needed for the refinement of the Ritz -! vectors is computed and stored in the array B. -! See the description of B. -! 'E' :: The unscaled eigenvectors of the Exact DMD are -! computed and returned in the array B. See the -! description of B. -! 'N' :: No eigenvector refinement data is computed. +!> \param[in] JOBF +!> \verbatim +!> JOBF (input) CHARACTER*1 +!> Specifies whether to store information needed for post- +!> processing (e.g. computing refined Ritz vectors) +!> 'R' :: The matrix needed for the refinement of the Ritz +!> vectors is computed and stored in the array B. +!> See the description of B. +!> 'E' :: The unscaled eigenvectors of the Exact DMD are +!> computed and returned in the array B. See the +!> description of B. +!> 'N' :: No eigenvector refinement data is computed. +!> \endverbatim !..... -! WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } -! Allows for a selection of the SVD algorithm from the -! LAPACK library. -! 1 :: DGESVD (the QR SVD algorithm) -! 2 :: DGESDD (the Divide and Conquer algorithm; if enough -! workspace available, this is the fastest option) -! 3 :: DGESVDQ (the preconditioned QR SVD ; this and 4 -! are the most accurate options) -! 4 :: DGEJSV (the preconditioned Jacobi SVD; this and 3 -! are the most accurate options) -! For the four methods above, a significant difference in -! the accuracy of small singular values is possible if -! the snapshots vary in norm so that X is severely -! ill-conditioned. If small (smaller than EPS*||X||) -! singular values are of interest and JOBS=='N', then -! the options (3, 4) give the most accurate results, where -! the option 4 is slightly better and with stronger -! theoretical background. -! If JOBS=='S', i.e. the columns of X will be normalized, -! then all methods give nearly equally accurate results. +!> \param[in] WHTSVD +!> \verbatim +!> WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } +!> Allows for a selection of the SVD algorithm from the +!> LAPACK library. 
+!> 1 :: DGESVD (the QR SVD algorithm) +!> 2 :: DGESDD (the Divide and Conquer algorithm; if enough +!> workspace available, this is the fastest option) +!> 3 :: DGESVDQ (the preconditioned QR SVD ; this and 4 +!> are the most accurate options) +!> 4 :: DGEJSV (the preconditioned Jacobi SVD; this and 3 +!> are the most accurate options) +!> For the four methods above, a significant difference in +!> the accuracy of small singular values is possible if +!> the snapshots vary in norm so that X is severely +!> ill-conditioned. If small (smaller than EPS*||X||) +!> singular values are of interest and JOBS=='N', then +!> the options (3, 4) give the most accurate results, where +!> the option 4 is slightly better and with stronger +!> theoretical background. +!> If JOBS=='S', i.e. the columns of X will be normalized, +!> then all methods give nearly equally accurate results. +!> \endverbatim !..... -! M (input) INTEGER, M>= 0 -! The state space dimension (the row dimension of X, Y). +!> \param[in] M +!> \verbatim +!> M (input) INTEGER, M>= 0 +!> The state space dimension (the row dimension of X, Y). +!> \endverbatim !..... -! N (input) INTEGER, 0 <= N <= M -! The number of data snapshot pairs -! (the number of columns of X and Y). +!> \param[in] N +!> \verbatim +!> N (input) INTEGER, 0 <= N <= M +!> The number of data snapshot pairs +!> (the number of columns of X and Y). +!> \endverbatim !..... -! X (input/output) REAL(KIND=WP) M-by-N array -! > On entry, X contains the data snapshot matrix X. It is -! assumed that the column norms of X are in the range of -! the normalized floating point numbers. -! < On exit, the leading K columns of X contain a POD basis, -! i.e. the leading K left singular vectors of the input -! data matrix X, U(:,1:K). All N columns of X contain all -! left singular vectors of the input matrix X. -! See the descriptions of K, Z and W. +!> \param[in,out] X +!> \verbatim +!> X (input/output) REAL(KIND=WP) M-by-N array +!> > On entry, X contains the data snapshot matrix X. It is +!> assumed that the column norms of X are in the range of +!> the normalized floating point numbers. +!> < On exit, the leading K columns of X contain a POD basis, +!> i.e. the leading K left singular vectors of the input +!> data matrix X, U(:,1:K). All N columns of X contain all +!> left singular vectors of the input matrix X. +!> See the descriptions of K, Z and W. +!> \endverbatim !..... -! LDX (input) INTEGER, LDX >= M -! The leading dimension of the array X. +!> \param[in] LDX +!> \verbatim +!> LDX (input) INTEGER, LDX >= M +!> The leading dimension of the array X. +!> \endverbatim !..... -! Y (input/workspace/output) REAL(KIND=WP) M-by-N array -! > On entry, Y contains the data snapshot matrix Y -! < On exit, -! If JOBR == 'R', the leading K columns of Y contain -! the residual vectors for the computed Ritz pairs. -! See the description of RES. -! If JOBR == 'N', Y contains the original input data, -! scaled according to the value of JOBS. +!> \param[in,out] Y +!> \verbatim +!> Y (input/workspace/output) REAL(KIND=WP) M-by-N array +!> > On entry, Y contains the data snapshot matrix Y +!> < On exit, +!> If JOBR == 'R', the leading K columns of Y contain +!> the residual vectors for the computed Ritz pairs. +!> See the description of RES. +!> If JOBR == 'N', Y contains the original input data, +!> scaled according to the value of JOBS. +!> \endverbatim !..... -! LDY (input) INTEGER , LDY >= M -! The leading dimension of the array Y. 
+!> \param[in] LDY +!> \verbatim +!> LDY (input) INTEGER , LDY >= M +!> The leading dimension of the array Y. +!> \endverbatim !..... -! NRNK (input) INTEGER -! Determines the mode how to compute the numerical rank, -! i.e. how to truncate small singular values of the input -! matrix X. On input, if -! NRNK = -1 :: i-th singular value sigma(i) is truncated -! if sigma(i) <= TOL*sigma(1). -! This option is recommended. -! NRNK = -2 :: i-th singular value sigma(i) is truncated -! if sigma(i) <= TOL*sigma(i-1) -! This option is included for R&D purposes. -! It requires highly accurate SVD, which -! may not be feasible. -! -! The numerical rank can be enforced by using positive -! value of NRNK as follows: -! 0 < NRNK <= N :: at most NRNK largest singular values -! will be used. If the number of the computed nonzero -! singular values is less than NRNK, then only those -! nonzero values will be used and the actually used -! dimension is less than NRNK. The actual number of -! the nonzero singular values is returned in the variable -! K. See the descriptions of TOL and K. +!> \param[in] NRNK +!> \verbatim +!> NRNK (input) INTEGER +!> Determines the mode how to compute the numerical rank, +!> i.e. how to truncate small singular values of the input +!> matrix X. On input, if +!> NRNK = -1 :: i-th singular value sigma(i) is truncated +!> if sigma(i) <= TOL*sigma(1). +!> This option is recommended. +!> NRNK = -2 :: i-th singular value sigma(i) is truncated +!> if sigma(i) <= TOL*sigma(i-1) +!> This option is included for R&D purposes. +!> It requires highly accurate SVD, which +!> may not be feasible. +!> +!> The numerical rank can be enforced by using positive +!> value of NRNK as follows: +!> 0 < NRNK <= N :: at most NRNK largest singular values +!> will be used. If the number of the computed nonzero +!> singular values is less than NRNK, then only those +!> nonzero values will be used and the actually used +!> dimension is less than NRNK. The actual number of +!> the nonzero singular values is returned in the variable +!> K. See the descriptions of TOL and K. +!> \endverbatim !..... -! TOL (input) REAL(KIND=WP), 0 <= TOL < 1 -! The tolerance for truncating small singular values. -! See the description of NRNK. +!> \param[in] TOL +!> \verbatim +!> TOL (input) REAL(KIND=WP), 0 <= TOL < 1 +!> The tolerance for truncating small singular values. +!> See the description of NRNK. +!> \endverbatim !..... -! K (output) INTEGER, 0 <= K <= N -! The dimension of the POD basis for the data snapshot -! matrix X and the number of the computed Ritz pairs. -! The value of K is determined according to the rule set -! by the parameters NRNK and TOL. -! See the descriptions of NRNK and TOL. +!> \param[out] K +!> \verbatim +!> K (output) INTEGER, 0 <= K <= N +!> The dimension of the POD basis for the data snapshot +!> matrix X and the number of the computed Ritz pairs. +!> The value of K is determined according to the rule set +!> by the parameters NRNK and TOL. +!> See the descriptions of NRNK and TOL. +!> \endverbatim !..... -! REIG (output) REAL(KIND=WP) N-by-1 array -! The leading K (K<=N) entries of REIG contain -! the real parts of the computed eigenvalues -! REIG(1:K) + sqrt(-1)*IMEIG(1:K). -! See the descriptions of K, IMEIG, and Z. +!> \param[out] REIG +!> \verbatim +!> REIG (output) REAL(KIND=WP) N-by-1 array +!> The leading K (K<=N) entries of REIG contain +!> the real parts of the computed eigenvalues +!> REIG(1:K) + sqrt(-1)*IMEIG(1:K). +!> See the descriptions of K, IMEIG, and Z. 
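The recommended NRNK = -1 rule above keeps sigma(i) only while sigma(i) > TOL*sigma(1). A small sketch of the resulting count, assuming SIGMA(1:N) holds the computed singular values in non-increasing order; for that setting this is essentially the value DGEDMD reports back in K:

    ! Count the singular values that survive the NRNK = -1 truncation rule.
    K = 0
    DO i = 1, N
       IF( SIGMA(i).GT.TOL*SIGMA(1) ) K = K + 1
    END DO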
+!> \endverbatim !..... -! IMEIG (output) REAL(KIND=WP) N-by-1 array -! The leading K (K<=N) entries of IMEIG contain -! the imaginary parts of the computed eigenvalues -! REIG(1:K) + sqrt(-1)*IMEIG(1:K). -! The eigenvalues are determined as follows: -! If IMEIG(i) == 0, then the corresponding eigenvalue is -! real, LAMBDA(i) = REIG(i). -! If IMEIG(i)>0, then the corresponding complex -! conjugate pair of eigenvalues reads -! LAMBDA(i) = REIG(i) + sqrt(-1)*IMAG(i) -! LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMAG(i) -! That is, complex conjugate pairs have consecutive -! indices (i,i+1), with the positive imaginary part -! listed first. -! See the descriptions of K, REIG, and Z. +!> \param[out] IMEIG +!> \verbatim +!> IMEIG (output) REAL(KIND=WP) N-by-1 array +!> The leading K (K<=N) entries of IMEIG contain +!> the imaginary parts of the computed eigenvalues +!> REIG(1:K) + sqrt(-1)*IMEIG(1:K). +!> The eigenvalues are determined as follows: +!> If IMEIG(i) == 0, then the corresponding eigenvalue is +!> real, LAMBDA(i) = REIG(i). +!> If IMEIG(i)>0, then the corresponding complex +!> conjugate pair of eigenvalues reads +!> LAMBDA(i) = REIG(i) + sqrt(-1)*IMAG(i) +!> LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMAG(i) +!> That is, complex conjugate pairs have consecutive +!> indices (i,i+1), with the positive imaginary part +!> listed first. +!> See the descriptions of K, REIG, and Z. +!> \endverbatim !..... -! Z (workspace/output) REAL(KIND=WP) M-by-N array -! If JOBZ =='V' then -! Z contains real Ritz vectors as follows: -! If IMEIG(i)=0, then Z(:,i) is an eigenvector of -! the i-th Ritz value; ||Z(:,i)||_2=1. -! If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then -! [Z(:,i) Z(:,i+1)] span an invariant subspace and -! the Ritz values extracted from this subspace are -! REIG(i) + sqrt(-1)*IMEIG(i) and -! REIG(i) - sqrt(-1)*IMEIG(i). -! The corresponding eigenvectors are -! Z(:,i) + sqrt(-1)*Z(:,i+1) and -! Z(:,i) - sqrt(-1)*Z(:,i+1), respectively. -! || Z(:,i:i+1)||_F = 1. -! If JOBZ == 'F', then the above descriptions hold for -! the columns of X(:,1:K)*W(1:K,1:K), where the columns -! of W(1:k,1:K) are the computed eigenvectors of the -! K-by-K Rayleigh quotient. The columns of W(1:K,1:K) -! are similarly structured: If IMEIG(i) == 0 then -! X(:,1:K)*W(:,i) is an eigenvector, and if IMEIG(i)>0 -! then X(:,1:K)*W(:,i)+sqrt(-1)*X(:,1:K)*W(:,i+1) and -! X(:,1:K)*W(:,i)-sqrt(-1)*X(:,1:K)*W(:,i+1) -! are the eigenvectors of LAMBDA(i), LAMBDA(i+1). -! See the descriptions of REIG, IMEIG, X and W. +!> \param[out] Z +!> \verbatim +!> Z (workspace/output) REAL(KIND=WP) M-by-N array +!> If JOBZ =='V' then +!> Z contains real Ritz vectors as follows: +!> If IMEIG(i)=0, then Z(:,i) is an eigenvector of +!> the i-th Ritz value; ||Z(:,i)||_2=1. +!> If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then +!> [Z(:,i) Z(:,i+1)] span an invariant subspace and +!> the Ritz values extracted from this subspace are +!> REIG(i) + sqrt(-1)*IMEIG(i) and +!> REIG(i) - sqrt(-1)*IMEIG(i). +!> The corresponding eigenvectors are +!> Z(:,i) + sqrt(-1)*Z(:,i+1) and +!> Z(:,i) - sqrt(-1)*Z(:,i+1), respectively. +!> || Z(:,i:i+1)||_F = 1. +!> If JOBZ == 'F', then the above descriptions hold for +!> the columns of X(:,1:K)*W(1:K,1:K), where the columns +!> of W(1:k,1:K) are the computed eigenvectors of the +!> K-by-K Rayleigh quotient. 
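The two conventions just described pair naturally in post-processing: the eigenvalues are reassembled from REIG/IMEIG, and with JOBZ = 'F' the Ritz vectors exist only as the product X(:,1:K)*W(1:K,1:K). A minimal sketch, assuming LAMBDA is a caller-side COMPLEX(KIND=WP) array of length at least K and ZFULL a caller-side M-by-K real array (both names are illustrative, not part of the interface):

    ! Rebuild LAMBDA(i) = REIG(i) + sqrt(-1)*IMEIG(i); for a conjugate
    ! pair, IMEIG(i+1) holds the negative imaginary part.
    DO i = 1, K
       LAMBDA(i) = CMPLX( REIG(i), IMEIG(i), KIND=WP )
    END DO
    ! With JOBZ='F', expand the factored Ritz vectors explicitly:
    ! ZFULL = X(:,1:K) * W(1:K,1:K).
    CALL DGEMM( 'N', 'N', M, K, K, 1.0D0, X, LDX, W, LDW, 0.0D0, ZFULL, M )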
The columns of W(1:K,1:K) +!> are similarly structured: If IMEIG(i) == 0 then +!> X(:,1:K)*W(:,i) is an eigenvector, and if IMEIG(i)>0 +!> then X(:,1:K)*W(:,i)+sqrt(-1)*X(:,1:K)*W(:,i+1) and +!> X(:,1:K)*W(:,i)-sqrt(-1)*X(:,1:K)*W(:,i+1) +!> are the eigenvectors of LAMBDA(i), LAMBDA(i+1). +!> See the descriptions of REIG, IMEIG, X and W. +!> \endverbatim !..... -! LDZ (input) INTEGER , LDZ >= M -! The leading dimension of the array Z. +!> \param[in] LDZ +!> \verbatim +!> LDZ (input) INTEGER , LDZ >= M +!> The leading dimension of the array Z. +!> \endverbatim !..... -! RES (output) REAL(KIND=WP) N-by-1 array -! RES(1:K) contains the residuals for the K computed -! Ritz pairs. -! If LAMBDA(i) is real, then -! RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i))||_2. -! If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair -! then -! RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F -! where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ] -! [-imag(LAMBDA(i)) real(LAMBDA(i)) ]. -! It holds that -! RES(i) = || A*ZC(:,i) - LAMBDA(i) *ZC(:,i) ||_2 -! RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2 -! where ZC(:,i) = Z(:,i) + sqrt(-1)*Z(:,i+1) -! ZC(:,i+1) = Z(:,i) - sqrt(-1)*Z(:,i+1) -! See the description of REIG, IMEIG and Z. +!> \param[out] RES +!> \verbatim +!> RES (output) REAL(KIND=WP) N-by-1 array +!> RES(1:K) contains the residuals for the K computed +!> Ritz pairs. +!> If LAMBDA(i) is real, then +!> RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i))||_2. +!> If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair +!> then +!> RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F +!> where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ] +!> [-imag(LAMBDA(i)) real(LAMBDA(i)) ]. +!> It holds that +!> RES(i) = || A*ZC(:,i) - LAMBDA(i) *ZC(:,i) ||_2 +!> RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2 +!> where ZC(:,i) = Z(:,i) + sqrt(-1)*Z(:,i+1) +!> ZC(:,i+1) = Z(:,i) - sqrt(-1)*Z(:,i+1) +!> See the description of REIG, IMEIG and Z. +!> \endverbatim !..... -! B (output) REAL(KIND=WP) M-by-N array. -! IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can -! be used for computing the refined vectors; see further -! details in the provided references. -! If JOBF == 'E', B(1:M,1;K) contains -! A*U(:,1:K)*W(1:K,1:K), which are the vectors from the -! Exact DMD, up to scaling by the inverse eigenvalues. -! If JOBF =='N', then B is not referenced. -! See the descriptions of X, W, K. +!> \param[out] B +!> \verbatim +!> B (output) REAL(KIND=WP) M-by-N array. +!> IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can +!> be used for computing the refined vectors; see further +!> details in the provided references. +!> If JOBF == 'E', B(1:M,1;K) contains +!> A*U(:,1:K)*W(1:K,1:K), which are the vectors from the +!> Exact DMD, up to scaling by the inverse eigenvalues. +!> If JOBF =='N', then B is not referenced. +!> See the descriptions of X, W, K. +!> \endverbatim !..... -! LDB (input) INTEGER, LDB >= M -! The leading dimension of the array B. +!> \param[in] LDB +!> \verbatim +!> LDB (input) INTEGER, LDB >= M +!> The leading dimension of the array B. +!> \endverbatim !..... -! W (workspace/output) REAL(KIND=WP) N-by-N array -! On exit, W(1:K,1:K) contains the K computed -! eigenvectors of the matrix Rayleigh quotient (real and -! imaginary parts for each complex conjugate pair of the -! eigenvalues). The Ritz vectors (returned in Z) are the -! product of X (containing a POD basis for the input -! matrix X) and W. See the descriptions of K, S, X and Z. -! 
W is also used as a workspace to temporarily store the -! right singular vectors of X. +!> \param[out] W +!> \verbatim +!> W (workspace/output) REAL(KIND=WP) N-by-N array +!> On exit, W(1:K,1:K) contains the K computed +!> eigenvectors of the matrix Rayleigh quotient (real and +!> imaginary parts for each complex conjugate pair of the +!> eigenvalues). The Ritz vectors (returned in Z) are the +!> product of X (containing a POD basis for the input +!> matrix X) and W. See the descriptions of K, S, X and Z. +!> W is also used as a workspace to temporarily store the +!> right singular vectors of X. +!> \endverbatim !..... -! LDW (input) INTEGER, LDW >= N -! The leading dimension of the array W. +!> \param[in] LDW +!> \verbatim +!> LDW (input) INTEGER, LDW >= N +!> The leading dimension of the array W. +!> \endverbatim !..... -! S (workspace/output) REAL(KIND=WP) N-by-N array -! The array S(1:K,1:K) is used for the matrix Rayleigh -! quotient. This content is overwritten during -! the eigenvalue decomposition by DGEEV. -! See the description of K. +!> \param[out] S +!> \verbatim +!> S (workspace/output) REAL(KIND=WP) N-by-N array +!> The array S(1:K,1:K) is used for the matrix Rayleigh +!> quotient. This content is overwritten during +!> the eigenvalue decomposition by DGEEV. +!> See the description of K. +!> \endverbatim !..... -! LDS (input) INTEGER, LDS >= N -! The leading dimension of the array S. +!> \param[in] LDS +!> \verbatim +!> LDS (input) INTEGER, LDS >= N +!> The leading dimension of the array S. +!> \endverbatim !..... -! WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array -! On exit, WORK(1:N) contains the singular values of -! X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). -! If WHTSVD==4, then WORK(N+1) and WORK(N+2) contain -! scaling factor WORK(N+2)/WORK(N+1) used to scale X -! and Y to avoid overflow in the SVD of X. -! This may be of interest if the scaling option is off -! and as many as possible smallest eigenvalues are -! desired to the highest feasible accuracy. -! If the call to DGEDMD is only workspace query, then -! WORK(1) contains the minimal workspace length and -! WORK(2) is the optimal workspace length. Hence, the -! leng of work is at least 2. -! See the description of LWORK. +!> \param[out] WORK +!> \verbatim +!> WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array +!> On exit, WORK(1:N) contains the singular values of +!> X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). +!> If WHTSVD==4, then WORK(N+1) and WORK(N+2) contain +!> scaling factor WORK(N+2)/WORK(N+1) used to scale X +!> and Y to avoid overflow in the SVD of X. +!> This may be of interest if the scaling option is off +!> and as many as possible smallest eigenvalues are +!> desired to the highest feasible accuracy. +!> If the call to DGEDMD is only workspace query, then +!> WORK(1) contains the minimal workspace length and +!> WORK(2) is the optimal workspace length. Hence, the +!> leng of work is at least 2. +!> See the description of LWORK. +!> \endverbatim !..... -! LWORK (input) INTEGER -! The minimal length of the workspace vector WORK. -! LWORK is calculated as follows: -! If WHTSVD == 1 :: -! If JOBZ == 'V', then -! LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)). -! If JOBZ == 'N' then -! LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)). -! Here LWORK_SVD = MAX(1,3*N+M,5*N) is the minimal -! workspace length of DGESVD. -! If WHTSVD == 2 :: -! If JOBZ == 'V', then -! LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)) -! If JOBZ == 'N', then -! 
LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)) -! Here LWORK_SVD = MAX(M, 5*N*N+4*N)+3*N*N is the -! minimal workspace length of DGESDD. -! If WHTSVD == 3 :: -! If JOBZ == 'V', then -! LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) -! If JOBZ == 'N', then -! LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) -! Here LWORK_SVD = N+M+MAX(3*N+1, -! MAX(1,3*N+M,5*N),MAX(1,N)) -! is the minimal workspace length of DGESVDQ. -! If WHTSVD == 4 :: -! If JOBZ == 'V', then -! LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) -! If JOBZ == 'N', then -! LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) -! Here LWORK_SVD = MAX(7,2*M+N,6*N+2*N*N) is the -! minimal workspace length of DGEJSV. -! The above expressions are not simplified in order to -! make the usage of WORK more transparent, and for -! easier checking. In any case, LWORK >= 2. -! If on entry LWORK = -1, then a workspace query is -! assumed and the procedure only computes the minimal -! and the optimal workspace lengths for both WORK and -! IWORK. See the descriptions of WORK and IWORK. +!> \param[in] LWORK +!> \verbatim +!> LWORK (input) INTEGER +!> The minimal length of the workspace vector WORK. +!> LWORK is calculated as follows: +!> If WHTSVD == 1 :: +!> If JOBZ == 'V', then +!> LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)). +!> If JOBZ == 'N' then +!> LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)). +!> Here LWORK_SVD = MAX(1,3*N+M,5*N) is the minimal +!> workspace length of DGESVD. +!> If WHTSVD == 2 :: +!> If JOBZ == 'V', then +!> LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)) +!> If JOBZ == 'N', then +!> LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)) +!> Here LWORK_SVD = MAX(M, 5*N*N+4*N)+3*N*N is the +!> minimal workspace length of DGESDD. +!> If WHTSVD == 3 :: +!> If JOBZ == 'V', then +!> LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) +!> If JOBZ == 'N', then +!> LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) +!> Here LWORK_SVD = N+M+MAX(3*N+1, +!> MAX(1,3*N+M,5*N),MAX(1,N)) +!> is the minimal workspace length of DGESVDQ. +!> If WHTSVD == 4 :: +!> If JOBZ == 'V', then +!> LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) +!> If JOBZ == 'N', then +!> LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) +!> Here LWORK_SVD = MAX(7,2*M+N,6*N+2*N*N) is the +!> minimal workspace length of DGEJSV. +!> The above expressions are not simplified in order to +!> make the usage of WORK more transparent, and for +!> easier checking. In any case, LWORK >= 2. +!> If on entry LWORK = -1, then a workspace query is +!> assumed and the procedure only computes the minimal +!> and the optimal workspace lengths for both WORK and +!> IWORK. See the descriptions of WORK and IWORK. +!> \endverbatim !..... -! IWORK (workspace/output) INTEGER LIWORK-by-1 array -! Workspace that is required only if WHTSVD equals -! 2 , 3 or 4. (See the description of WHTSVD). -! If on entry LWORK =-1 or LIWORK=-1, then the -! minimal length of IWORK is computed and returned in -! IWORK(1). See the description of LIWORK. +!> \param[out] IWORK +!> \verbatim +!> IWORK (workspace/output) INTEGER LIWORK-by-1 array +!> Workspace that is required only if WHTSVD equals +!> 2 , 3 or 4. (See the description of WHTSVD). +!> If on entry LWORK =-1 or LIWORK=-1, then the +!> minimal length of IWORK is computed and returned in +!> IWORK(1). See the description of LIWORK. +!> \endverbatim !..... -! LIWORK (input) INTEGER -! The minimal length of the workspace vector IWORK. -! If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 -! If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) -! If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) -! 
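The LWORK = -1 / LIWORK = -1 query described above can be issued once, before the WORK and IWORK arrays are sized. A minimal sketch, assuming the other arguments are already valid and WQRY(2) and IQRY(1) are small scratch arrays that only receive the reported lengths (their names are illustrative):

    ! Workspace query: no computation is performed; WORK(1)/WORK(2) return
    ! the minimal/optimal WORK length and IWORK(1) the minimal IWORK length.
    CALL DGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, M, N, X, LDX, Y, LDY, &
                 NRNK, TOL, K, REIG, IMEIG, Z, LDZ, RES, B, LDB,       &
                 W, LDW, S, LDS, WQRY, -1, IQRY, -1, INFO )
    LWORK  = INT( WQRY(2) )
    LIWORK = IQRY(1)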
If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) -! If on entry LIWORK = -1, then a workspace query is -! assumed and the procedure only computes the minimal -! and the optimal workspace lengths for both WORK and -! IWORK. See the descriptions of WORK and IWORK. +!> \param[in] LIWORK +!> \verbatim +!> LIWORK (input) INTEGER +!> The minimal length of the workspace vector IWORK. +!> If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 +!> If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) +!> If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) +!> If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) +!> If on entry LIWORK = -1, then a workspace query is +!> assumed and the procedure only computes the minimal +!> and the optimal workspace lengths for both WORK and +!> IWORK. See the descriptions of WORK and IWORK. +!> \endverbatim !..... -! INFO (output) INTEGER -! -i < 0 :: On entry, the i-th argument had an -! illegal value -! = 0 :: Successful return. -! = 1 :: Void input. Quick exit (M=0 or N=0). -! = 2 :: The SVD computation of X did not converge. -! Suggestion: Check the input data and/or -! repeat with different WHTSVD. -! = 3 :: The computation of the eigenvalues did not -! converge. -! = 4 :: If data scaling was requested on input and -! the procedure found inconsistency in the data -! such that for some column index i, -! X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set -! to zero if JOBS=='C'. The computation proceeds -! with original or modified data and warning -! flag is set with INFO=4. +!> \param[out] INFO +!> \verbatim +!> INFO (output) INTEGER +!> -i < 0 :: On entry, the i-th argument had an +!> illegal value +!> = 0 :: Successful return. +!> = 1 :: Void input. Quick exit (M=0 or N=0). +!> = 2 :: The SVD computation of X did not converge. +!> Suggestion: Check the input data and/or +!> repeat with different WHTSVD. +!> = 3 :: The computation of the eigenvalues did not +!> converge. +!> = 4 :: If data scaling was requested on input and +!> the procedure found inconsistency in the data +!> such that for some column index i, +!> X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set +!> to zero if JOBS=='C'. The computation proceeds +!> with original or modified data and warning +!> flag is set with INFO=4. +!> \endverbatim +! +! Authors: +! ======== +! +!> \author Zlatko Drmac +! +!> \ingroup gedmd +! !............................................................. !............................................................. + SUBROUTINE DGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & + M, N, X, LDX, Y, LDY, NRNK, TOL, & + K, REIG, IMEIG, Z, LDZ, RES, & + B, LDB, W, LDW, S, LDS, & + WORK, LWORK, IWORK, LIWORK, INFO ) +! +! -- LAPACK driver routine -- +! +! -- LAPACK is a software package provided by University of -- +! -- Tennessee, University of California Berkeley, University of -- +! -- Colorado Denver and NAG Ltd.. -- +! +!..... + USE iso_fortran_env + IMPLICIT NONE + INTEGER, PARAMETER :: WP = real64 +! +! Scalar arguments +! ~~~~~~~~~~~~~~~~ + CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF + INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & + NRNK, LDZ, LDB, LDW, LDS, & + LWORK, LIWORK + INTEGER, INTENT(OUT) :: K, INFO + REAL(KIND=WP), INTENT(IN) :: TOL +! +! Array arguments +! ~~~~~~~~~~~~~~~ + REAL(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) + REAL(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & + W(LDW,*), S(LDS,*) + REAL(KIND=WP), INTENT(OUT) :: REIG(*), IMEIG(*), & + RES(*) + REAL(KIND=WP), INTENT(OUT) :: WORK(*) + INTEGER, INTENT(OUT) :: IWORK(*) +! ! Parameters ! 
~~~~~~~~~~ REAL(KIND=WP), PARAMETER :: ONE = 1.0_WP REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP - +! ! Local scalars ! ~~~~~~~~~~~~~ REAL(KIND=WP) :: OFL, ROOTSC, SCALE, SMALL, & @@ -432,10 +582,11 @@ WNTEX, WNTREF, WNTRES, WNTVEC CHARACTER :: JOBZL, T_OR_N CHARACTER :: JSVOPT - +! ! Local arrays ! ~~~~~~~~~~~~ REAL(KIND=WP) :: AB(2,2), RDUMMY(2), RDUMMY2(2) +! ! External functions (BLAS and LAPACK) ! ~~~~~~~~~~~~~~~~~ REAL(KIND=WP) DLANGE, DLAMCH, DNRM2 @@ -443,13 +594,13 @@ INTEGER IDAMAX LOGICAL DISNAN, LSAME EXTERNAL DISNAN, LSAME - +! ! External subroutines (BLAS and LAPACK) ! ~~~~~~~~~~~~~~~~~~~~ EXTERNAL DAXPY, DGEMM, DSCAL EXTERNAL DGEEV, DGEJSV, DGESDD, DGESVD, DGESVDQ, & DLACPY, DLASCL, DLASSQ, XERBLA - +! ! Intrinsic functions ! ~~~~~~~~~~~~~~~~~~~ INTRINSIC DBLE, INT, MAX, SQRT @@ -632,7 +783,8 @@ K = 0 DO i = 1, N !WORK(i) = DNRM2( M, X(1,i), 1 ) - SCALE = ZERO + SSUM = ONE + SCALE = ZERO CALL DLASSQ( M, X(1,i), 1, SCALE, SSUM ) IF ( DISNAN(SCALE) .OR. DISNAN(SSUM) ) THEN K = 0 @@ -705,7 +857,8 @@ ! carefully computed using DLASSQ. DO i = 1, N !WORK(i) = DNRM2( M, Y(1,i), 1 ) - SCALE = ZERO + SSUM = ONE + SCALE = ZERO CALL DLASSQ( M, Y(1,i), 1, SCALE, SSUM ) IF ( DISNAN(SCALE) .OR. DISNAN(SSUM) ) THEN K = 0 @@ -1051,4 +1204,3 @@ RETURN ! ...... END SUBROUTINE DGEDMD - diff --git a/lapack-netlib/SRC/dgehrd.f b/lapack-netlib/SRC/dgehrd.f index a40c61cb6..d95bbd182 100644 --- a/lapack-netlib/SRC/dgehrd.f +++ b/lapack-netlib/SRC/dgehrd.f @@ -89,7 +89,7 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension (LWORK) +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> @@ -120,7 +120,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEcomputational +*> \ingroup gehrd * *> \par Further Details: * ===================== @@ -173,7 +173,7 @@ INTEGER IHI, ILO, INFO, LDA, LWORK, N * .. * .. Array Arguments .. - DOUBLE PRECISION A( LDA, * ), TAU( * ), WORK( * ) + DOUBLE PRECISION A( LDA, * ), TAU( * ), WORK( * ) * .. * * ===================================================================== @@ -182,7 +182,7 @@ INTEGER NBMAX, LDT, TSIZE PARAMETER ( NBMAX = 64, LDT = NBMAX+1, $ TSIZE = LDT*NBMAX ) - DOUBLE PRECISION ZERO, ONE + DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D+0, $ ONE = 1.0D+0 ) * .. @@ -190,7 +190,7 @@ LOGICAL LQUERY INTEGER I, IB, IINFO, IWT, J, LDWORK, LWKOPT, NB, $ NBMIN, NH, NX - DOUBLE PRECISION EI + DOUBLE PRECISION EI * .. * .. External Subroutines .. 
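The SSUM = ONE initialisation added in the two DGEDMD column-norm loops above matters because DLASSQ updates its (SCALE, SUMSQ) pair in place rather than overwriting it. A minimal sketch of the convention those loops rely on:

    ! After the call, SCALE*SQRT(SSUM) is the 2-norm of X(1:M,i), provided
    ! SCALE and SSUM were initialised (here to 0 and 1) before the call.
    SCALE = 0.0D0
    SSUM  = 1.0D0
    CALL DLASSQ( M, X(1,i), 1, SCALE, SSUM )
    COLNRM = SCALE*SQRT( SSUM )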
EXTERNAL DAXPY, DGEHD2, DGEMM, DLAHR2, DLARFB, DTRMM, @@ -221,12 +221,18 @@ INFO = -8 END IF * + NH = IHI - ILO + 1 IF( INFO.EQ.0 ) THEN * * Compute the workspace requirements * - NB = MIN( NBMAX, ILAENV( 1, 'DGEHRD', ' ', N, ILO, IHI, -1 ) ) - LWKOPT = N*NB + TSIZE + IF( NH.LE.1 ) THEN + LWKOPT = 1 + ELSE + NB = MIN( NBMAX, ILAENV( 1, 'DGEHRD', ' ', N, ILO, IHI, + $ -1 ) ) + LWKOPT = N*NB + TSIZE + ENDIF WORK( 1 ) = LWKOPT END IF * @@ -248,7 +254,6 @@ * * Quick return if possible * - NH = IHI - ILO + 1 IF( NH.LE.1 ) THEN WORK( 1 ) = 1 RETURN @@ -268,7 +273,7 @@ * * Determine if workspace is large enough for blocked code * - IF( LWORK.LT.N*NB+TSIZE ) THEN + IF( LWORK.LT.LWKOPT ) THEN * * Not enough workspace to use optimal NB: determine the * minimum value of NB, and reduce NB or force use of @@ -344,6 +349,7 @@ * Use unblocked code to reduce the rest of the matrix * CALL DGEHD2( N, I, IHI, A, LDA, TAU, WORK, IINFO ) +* WORK( 1 ) = LWKOPT * RETURN diff --git a/lapack-netlib/SRC/dgelq.f b/lapack-netlib/SRC/dgelq.f index 013b6c356..255e8732f 100644 --- a/lapack-netlib/SRC/dgelq.f +++ b/lapack-netlib/SRC/dgelq.f @@ -98,7 +98,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. The routine *> only calculates the sizes of the T and WORK arrays, returns these *> values as the first entries of the T and WORK arrays, and no error @@ -166,6 +166,8 @@ *> the LQ factorization. *> \endverbatim *> +*> \ingroup gelq +*> * ===================================================================== SUBROUTINE DGELQ( M, N, A, LDA, T, TSIZE, WORK, LWORK, $ INFO ) diff --git a/lapack-netlib/SRC/dgelqf.f b/lapack-netlib/SRC/dgelqf.f index ed3372f96..f0eb00a55 100644 --- a/lapack-netlib/SRC/dgelqf.f +++ b/lapack-netlib/SRC/dgelqf.f @@ -93,7 +93,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,M). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= M, otherwise. *> For optimum performance LWORK >= M*NB, where NB is the *> optimal blocksize. *> @@ -118,7 +119,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEcomputational +*> \ingroup gelqf * *> \par Further Details: * ===================== @@ -174,9 +175,8 @@ * Test the input arguments * INFO = 0 + K = MIN( M, N ) NB = ILAENV( 1, 'DGELQF', ' ', M, N, -1, -1 ) - LWKOPT = M*NB - WORK( 1 ) = LWKOPT LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -184,19 +184,25 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, M ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + ELSE IF( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( N.GT.0 .AND. LWORK.LT.MAX( 1, M ) ) ) + $ INFO = -7 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'DGELQF', -INFO ) RETURN ELSE IF( LQUERY ) THEN + IF( K.EQ.0 ) THEN + LWKOPT = 1 + ELSE + LWKOPT = M*NB + END IF + WORK( 1 ) = LWKOPT RETURN END IF * * Quick return if possible * - K = MIN( M, N ) IF( K.EQ.0 ) THEN WORK( 1 ) = 1 RETURN diff --git a/lapack-netlib/SRC/dgelsd.f b/lapack-netlib/SRC/dgelsd.f index b1f45a2c6..7dc564f48 100644 --- a/lapack-netlib/SRC/dgelsd.f +++ b/lapack-netlib/SRC/dgelsd.f @@ -188,7 +188,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEsolve +*> \ingroup gelsd * *> \par Contributors: * ================== @@ -228,7 +228,7 @@ DOUBLE PRECISION ANRM, BIGNUM, BNRM, EPS, SFMIN, SMLNUM * .. * .. 
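The DGEHRD and DGELQF hunks above (and several below) make the two-call workspace convention consistent: a first call with LWORK = -1 only reports the optimal size in WORK(1), and the real call then supplies at least that much. A minimal sketch for DGELQF, assuming A, TAU and the dimensions are set up and WORK is declared allocatable:

    ! First call: workspace query only; WKOPT(1) receives the optimal size.
    CALL DGELQF( M, N, A, LDA, TAU, WKOPT, -1, INFO )
    LWORK = MAX( 1, INT( WKOPT(1) ) )
    ALLOCATE( WORK(LWORK) )
    ! Second call: the actual LQ factorization.
    CALL DGELQF( M, N, A, LDA, TAU, WORK, LWORK, INFO )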
External Subroutines .. - EXTERNAL DGEBRD, DGELQF, DGEQRF, DLABAD, DLACPY, DLALSD, + EXTERNAL DGEBRD, DGELQF, DGEQRF, DLACPY, DLALSD, $ DLASCL, DLASET, DORMBR, DORMLQ, DORMQR, XERBLA * .. * .. External Functions .. @@ -276,7 +276,7 @@ $ LOG( TWO ) ) + 1, 0 ) * IF( INFO.EQ.0 ) THEN - MAXWRK = 0 + MAXWRK = 1 LIWORK = 3*MINMN*NLVL + 11*MINMN MM = M IF( M.GE.N .AND. M.GE.MNTHR ) THEN @@ -372,7 +372,6 @@ SFMIN = DLAMCH( 'S' ) SMLNUM = SFMIN / EPS BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) * * Scale A if max entry outside range [SMLNUM,BIGNUM]. * diff --git a/lapack-netlib/SRC/dgemlq.f b/lapack-netlib/SRC/dgemlq.f index 3ba209105..757683f46 100644 --- a/lapack-netlib/SRC/dgemlq.f +++ b/lapack-netlib/SRC/dgemlq.f @@ -111,16 +111,17 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1, then a workspace query is assumed. The routine *> only calculates the size of the WORK array, returns this -*> value as WORK(1), and no error message related to WORK +*> value as WORK(1), and no error message related to WORK *> is issued by XERBLA. *> \endverbatim *> @@ -144,7 +145,7 @@ *> *> \verbatim *> -*> These details are particular for this LAPACK implementation. Users should not +*> These details are particular for this LAPACK implementation. Users should not *> take them for granted. These details may change in the future, and are not likely *> true for another LAPACK implementation. These details are relevant if one wants *> to try to understand the code. They are not part of the interface. @@ -160,11 +161,13 @@ *> block sizes MB and NB returned by ILAENV, DGELQ will use either *> DLASWLQ (if the matrix is wide-and-short) or DGELQT to compute *> the LQ factorization. -*> This version of DGEMLQ will use either DLAMSWLQ or DGEMLQT to +*> This version of DGEMLQ will use either DLAMSWLQ or DGEMLQT to *> multiply matrix Q by another matrix. *> Further Details in DLAMSWLQ or DGEMLQT. *> \endverbatim *> +*> \ingroup gemlq +*> * ===================================================================== SUBROUTINE DGEMLQ( SIDE, TRANS, M, N, K, A, LDA, T, TSIZE, $ C, LDC, WORK, LWORK, INFO ) @@ -186,7 +189,7 @@ * .. * .. Local Scalars .. LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER MB, NB, LW, NBLCKS, MN + INTEGER MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME @@ -202,7 +205,7 @@ * * Test the input arguments * - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'T' ) LEFT = LSAME( SIDE, 'L' ) @@ -217,6 +220,13 @@ LW = M * MB MN = N END IF +* + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF * IF( ( NB.GT.K ) .AND. ( MN.GT.K ) ) THEN IF( MOD( MN - K, NB - K ) .EQ. 0 ) THEN @@ -245,12 +255,12 @@ INFO = -9 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN INFO = -11 - ELSE IF( ( LWORK.LT.MAX( 1, LW ) ) .AND. ( .NOT.LQUERY ) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. 
.NOT.LQUERY ) THEN INFO = -13 END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = LW + WORK( 1 ) = LWMIN END IF * IF( INFO.NE.0 ) THEN @@ -262,7 +272,7 @@ * * Quick return if possible * - IF( MIN( M, N, K ).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * @@ -275,7 +285,7 @@ $ MB, C, LDC, WORK, LWORK, INFO ) END IF * - WORK( 1 ) = LW + WORK( 1 ) = LWMIN * RETURN * diff --git a/lapack-netlib/SRC/dgemqr.f b/lapack-netlib/SRC/dgemqr.f index 022cf21e4..608815483 100644 --- a/lapack-netlib/SRC/dgemqr.f +++ b/lapack-netlib/SRC/dgemqr.f @@ -111,16 +111,17 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1, then a workspace query is assumed. The routine *> only calculates the size of the WORK array, returns this -*> value as WORK(1), and no error message related to WORK +*> value as WORK(1), and no error message related to WORK *> is issued by XERBLA. *> \endverbatim *> @@ -144,7 +145,7 @@ *> *> \verbatim *> -*> These details are particular for this LAPACK implementation. Users should not +*> These details are particular for this LAPACK implementation. Users should not *> take them for granted. These details may change in the future, and are not likely *> true for another LAPACK implementation. These details are relevant if one wants *> to try to understand the code. They are not part of the interface. @@ -160,12 +161,14 @@ *> block sizes MB and NB returned by ILAENV, DGEQR will use either *> DLATSQR (if the matrix is tall-and-skinny) or DGEQRT to compute *> the QR factorization. -*> This version of DGEMQR will use either DLAMTSQR or DGEMQRT to +*> This version of DGEMQR will use either DLAMTSQR or DGEMQRT to *> multiply matrix Q by another matrix. *> Further Details in DLATMSQR or DGEMQRT. *> *> \endverbatim *> +*> \ingroup gemqr +*> * ===================================================================== SUBROUTINE DGEMQR( SIDE, TRANS, M, N, K, A, LDA, T, TSIZE, $ C, LDC, WORK, LWORK, INFO ) @@ -187,7 +190,7 @@ * .. * .. Local Scalars .. LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER MB, NB, LW, NBLCKS, MN + INTEGER MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME @@ -203,7 +206,7 @@ * * Test the input arguments * - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'T' ) LEFT = LSAME( SIDE, 'L' ) @@ -218,6 +221,13 @@ LW = MB * NB MN = N END IF +* + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF * IF( ( MB.GT.K ) .AND. ( MN.GT.K ) ) THEN IF( MOD( MN - K, MB - K ).EQ.0 ) THEN @@ -246,12 +256,12 @@ INFO = -9 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN INFO = -11 - ELSE IF( ( LWORK.LT.MAX( 1, LW ) ) .AND. ( .NOT.LQUERY ) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. 
.NOT.LQUERY ) THEN INFO = -13 END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = LW + WORK( 1 ) = LWMIN END IF * IF( INFO.NE.0 ) THEN @@ -263,7 +273,7 @@ * * Quick return if possible * - IF( MIN( M, N, K ).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * @@ -276,7 +286,7 @@ $ NB, C, LDC, WORK, LWORK, INFO ) END IF * - WORK( 1 ) = LW + WORK( 1 ) = LWMIN * RETURN * diff --git a/lapack-netlib/SRC/dgeqlf.f b/lapack-netlib/SRC/dgeqlf.f index b8ac0b1a0..a72d9dc76 100644 --- a/lapack-netlib/SRC/dgeqlf.f +++ b/lapack-netlib/SRC/dgeqlf.f @@ -88,7 +88,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,N). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise. *> For optimum performance LWORK >= N*NB, where NB is the *> optimal blocksize. *> @@ -113,7 +114,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEcomputational +*> \ingroup geqlf * *> \par Further Details: * ===================== @@ -188,8 +189,9 @@ END IF WORK( 1 ) = LWKOPT * - IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + IF( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( M.GT.0 .AND. LWORK.LT.MAX( 1, N ) ) ) + $ INFO = -7 END IF END IF * diff --git a/lapack-netlib/SRC/dgeqp3rk.f b/lapack-netlib/SRC/dgeqp3rk.f index 117a68287..b8e41b39c 100644 --- a/lapack-netlib/SRC/dgeqp3rk.f +++ b/lapack-netlib/SRC/dgeqp3rk.f @@ -427,7 +427,8 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*. LWORK >= (3*N + NRHS - 1) +*> LWORK >= 1, if MIN(M,N) = 0, and +*> LWORK >= (3*N+NRHS-1), otherwise. *> For optimal performance LWORK >= (2*N + NB*( N+NRHS+1 )), *> where NB is the optimal block size for DGEQP3RK returned *> by ILAENV. Minimal block size MINNB=2. diff --git a/lapack-netlib/SRC/dgeqr.f b/lapack-netlib/SRC/dgeqr.f index eac8930ce..6ed8f211f 100644 --- a/lapack-netlib/SRC/dgeqr.f +++ b/lapack-netlib/SRC/dgeqr.f @@ -99,7 +99,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. The routine *> only calculates the sizes of the T and WORK arrays, returns these *> values as the first entries of the T and WORK arrays, and no error @@ -168,6 +168,8 @@ *> *> \endverbatim *> +*> \ingroup geqr +*> * ===================================================================== SUBROUTINE DGEQR( M, N, A, LDA, T, TSIZE, WORK, LWORK, $ INFO ) @@ -188,7 +190,7 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, LMINWS, MINT, MINW - INTEGER MB, NB, MINTSZ, NBLCKS + INTEGER MB, NB, MINTSZ, NBLCKS, LWMIN, LWREQ * .. * .. External Functions .. LOGICAL LSAME @@ -244,8 +246,10 @@ * * Determine if the workspace size satisfies minimal size * + LWMIN = MAX( 1, N ) + LWREQ = MAX( 1, N*NB ) LMINWS = .FALSE. - IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.NB*N ) + IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.LWREQ ) $ .AND. ( LWORK.GE.N ) .AND. ( TSIZE.GE.MINTSZ ) $ .AND. ( .NOT.LQUERY ) ) THEN IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) ) THEN @@ -253,7 +257,7 @@ NB = 1 MB = M END IF - IF( LWORK.LT.NB*N ) THEN + IF( LWORK.LT.LWREQ ) THEN LMINWS = .TRUE. NB = 1 END IF @@ -268,7 +272,7 @@ ELSE IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) $ .AND. ( .NOT.LQUERY ) .AND. ( .NOT.LMINWS ) ) THEN INFO = -6 - ELSE IF( ( LWORK.LT.MAX( 1, N*NB ) ) .AND. ( .NOT.LQUERY ) + ELSE IF( ( LWORK.LT.LWREQ ) .AND. ( .NOT.LQUERY ) $ .AND. 
( .NOT.LMINWS ) ) THEN INFO = -8 END IF @@ -282,9 +286,9 @@ T( 2 ) = MB T( 3 ) = NB IF( MINW ) THEN - WORK( 1 ) = MAX( 1, N ) + WORK( 1 ) = LWMIN ELSE - WORK( 1 ) = MAX( 1, NB*N ) + WORK( 1 ) = LWREQ END IF END IF IF( INFO.NE.0 ) THEN @@ -309,7 +313,7 @@ $ LWORK, INFO ) END IF * - WORK( 1 ) = MAX( 1, NB*N ) + WORK( 1 ) = LWREQ * RETURN * diff --git a/lapack-netlib/SRC/dgeqrfp.f b/lapack-netlib/SRC/dgeqrfp.f index 46d2ee479..aa757e96c 100644 --- a/lapack-netlib/SRC/dgeqrfp.f +++ b/lapack-netlib/SRC/dgeqrfp.f @@ -97,7 +97,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,N). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise. *> For optimum performance LWORK >= N*NB, where NB is *> the optimal blocksize. *> @@ -122,7 +123,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEcomputational +*> \ingroup geqrfp * *> \par Further Details: * ===================== @@ -162,8 +163,8 @@ * * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, IINFO, IWS, K, LDWORK, LWKOPT, NB, - $ NBMIN, NX + INTEGER I, IB, IINFO, IWS, K, LDWORK, LWKMIN, LWKOPT, + $ NB, NBMIN, NX * .. * .. External Subroutines .. EXTERNAL DGEQR2P, DLARFB, DLARFT, XERBLA @@ -181,8 +182,16 @@ * INFO = 0 NB = ILAENV( 1, 'DGEQRF', ' ', M, N, -1, -1 ) - LWKOPT = N*NB + K = MIN( M, N ) + IF( K.EQ.0 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = N + LWKOPT = N*NB + END IF WORK( 1 ) = LWKOPT +* LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -190,7 +199,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -7 END IF IF( INFO.NE.0 ) THEN @@ -202,7 +211,6 @@ * * Quick return if possible * - K = MIN( M, N ) IF( K.EQ.0 ) THEN WORK( 1 ) = 1 RETURN @@ -210,7 +218,7 @@ * NBMIN = 2 NX = 0 - IWS = N + IWS = LWKMIN IF( NB.GT.1 .AND. NB.LT.K ) THEN * * Determine when to cross over from blocked to unblocked code. diff --git a/lapack-netlib/SRC/dgerqf.f b/lapack-netlib/SRC/dgerqf.f index cca9d6367..435239cc7 100644 --- a/lapack-netlib/SRC/dgerqf.f +++ b/lapack-netlib/SRC/dgerqf.f @@ -114,7 +114,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEcomputational +*> \ingroup gerqf * *> \par Further Details: * ===================== @@ -189,7 +189,7 @@ END IF WORK( 1 ) = LWKOPT * - IF ( .NOT.LQUERY ) THEN + IF( .NOT.LQUERY ) THEN IF( LWORK.LE.0 .OR. ( N.GT.0 .AND. LWORK.LT.MAX( 1, M ) ) ) $ INFO = -7 END IF diff --git a/lapack-netlib/SRC/dgesvj.f b/lapack-netlib/SRC/dgesvj.f index 5fdb21e45..198bfb0a5 100644 --- a/lapack-netlib/SRC/dgesvj.f +++ b/lapack-netlib/SRC/dgesvj.f @@ -208,7 +208,7 @@ *> *> \param[in,out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension (LWORK) +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) *> On entry : *> If JOBU = 'C' : *> WORK(1) = CTOL, where CTOL defines the threshold for convergence. @@ -239,7 +239,12 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> length of WORK, WORK >= MAX(6,M+N) +*> The length of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MAX(6,M+N), otherwise. +*> +*> If on entry LWORK = -1, then a workspace query is assumed and +*> no computation is done; WORK(1) is set to the minial (and optimal) +*> length of WORK. *> \endverbatim *> *> \param[out] INFO @@ -260,7 +265,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. 
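The DGESVJ hunk above adds the same LWORK = -1 query to a routine that previously had none. A minimal sketch, assuming A, SVA, V and the dimensions are already prepared and that the literal JOBA/JOBU/JOBV values below are only an example of the caller's intended options:

    ! Query call: no Jacobi sweeps are performed; WQRY(1) receives the
    ! minimal (and optimal) WORK length, MAX(6,M+N) for a nonempty matrix.
    CALL DGESVJ( 'G', 'U', 'V', M, N, A, LDA, SVA, 0, V, LDV, WQRY, -1, INFO )
    LWORK = MAX( 1, INT( WQRY(1) ) )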
* -*> \ingroup doubleGEcomputational +*> \ingroup gesvj * *> \par Further Details: * ===================== @@ -365,9 +370,9 @@ INTEGER BLSKIP, EMPTSW, i, ibr, IERR, igl, IJBLSK, ir1, $ ISWROT, jbc, jgl, KBL, LKAHEAD, MVL, N2, N34, $ N4, NBL, NOTROT, p, PSKIPPED, q, ROWSKIP, - $ SWBAND - LOGICAL APPLV, GOSCALE, LOWER, LSVEC, NOSCALE, ROTOK, - $ RSVEC, UCTOL, UPPER + $ SWBAND, MINMN, LWMIN + LOGICAL APPLV, GOSCALE, LOWER, LQUERY, LSVEC, NOSCALE, + $ ROTOK, RSVEC, UCTOL, UPPER * .. * .. Local Arrays .. DOUBLE PRECISION FASTR( 5 ) @@ -408,6 +413,14 @@ UPPER = LSAME( JOBA, 'U' ) LOWER = LSAME( JOBA, 'L' ) * + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 6, M+N ) + END IF +* + LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.( UPPER .OR. LOWER .OR. LSAME( JOBA, 'G' ) ) ) THEN INFO = -1 ELSE IF( .NOT.( LSVEC .OR. UCTOL .OR. LSAME( JOBU, 'N' ) ) ) THEN @@ -427,7 +440,7 @@ INFO = -11 ELSE IF( UCTOL .AND. ( WORK( 1 ).LE.ONE ) ) THEN INFO = -12 - ELSE IF( LWORK.LT.MAX( M+N, 6 ) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. ( .NOT.LQUERY ) ) THEN INFO = -13 ELSE INFO = 0 @@ -437,11 +450,14 @@ IF( INFO.NE.0 ) THEN CALL XERBLA( 'DGESVJ', -INFO ) RETURN + ELSE IF( LQUERY ) THEN + WORK( 1 ) = LWMIN + RETURN END IF * * #:) Quick return for void matrix * - IF( ( M.EQ.0 ) .OR. ( N.EQ.0 ) )RETURN + IF( MINMN.EQ.0 ) RETURN * * Set numerical parameters * The stopping criterion for Jacobi rotations is diff --git a/lapack-netlib/SRC/dgetri.f b/lapack-netlib/SRC/dgetri.f index 92ef90c18..7b5a3a1b6 100644 --- a/lapack-netlib/SRC/dgetri.f +++ b/lapack-netlib/SRC/dgetri.f @@ -107,7 +107,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEcomputational +*> \ingroup getri * * ===================================================================== SUBROUTINE DGETRI( N, A, LDA, IPIV, WORK, LWORK, INFO ) @@ -151,8 +151,9 @@ * INFO = 0 NB = ILAENV( 1, 'DGETRI', ' ', N, -1, -1, -1 ) - LWKOPT = N*NB + LWKOPT = MAX( 1, N*NB ) WORK( 1 ) = LWKOPT +* LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.0 ) THEN INFO = -1 diff --git a/lapack-netlib/SRC/dgetsls.f b/lapack-netlib/SRC/dgetsls.f index 25f4c12c2..73b505ff7 100644 --- a/lapack-netlib/SRC/dgetsls.f +++ b/lapack-netlib/SRC/dgetsls.f @@ -127,7 +127,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. *> If LWORK = -1, the routine calculates optimal size of WORK for the *> optimal performance and returns this value in WORK(1). @@ -154,7 +154,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEsolve +*> \ingroup getsls * * ===================================================================== SUBROUTINE DGETSLS( TRANS, M, N, NRHS, A, LDA, B, LDB, @@ -189,7 +189,7 @@ * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION DLAMCH, DLANGE - EXTERNAL LSAME, DLABAD, DLAMCH, DLANGE + EXTERNAL LSAME, DLAMCH, DLANGE * .. * .. External Subroutines .. 
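The DGETRI change above makes the reported optimum at least 1 even for N = 0. A minimal usage sketch of the query-then-invert pattern, assuming A and IPIV hold the LU factors from an earlier successful DGETRF call and WORK is declared allocatable:

    ! Query, allocate, then invert in place.
    CALL DGETRI( N, A, LDA, IPIV, WKOPT, -1, INFO )
    LWORK = INT( WKOPT(1) )
    ALLOCATE( WORK(LWORK) )
    CALL DGETRI( N, A, LDA, IPIV, WORK, LWORK, INFO )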
EXTERNAL DGEQR, DGEMQR, DLASCL, DLASET, @@ -226,7 +226,10 @@ * * Determine the optimum and minimum LWORK * - IF( M.GE.N ) THEN + IF( MIN( M, N, NRHS ).EQ.0 ) THEN + WSIZEM = 1 + WSIZEO = 1 + ELSE IF( M.GE.N ) THEN CALL DGEQR( M, N, A, LDA, TQ, -1, WORKQ, -1, INFO2 ) TSZO = INT( TQ( 1 ) ) LWO = INT( WORKQ( 1 ) ) @@ -294,7 +297,6 @@ * SMLNUM = DLAMCH( 'S' ) / DLAMCH( 'P' ) BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) * * Scale A, B if max element outside range [SMLNUM,BIGNUM] * diff --git a/lapack-netlib/SRC/dgetsqrhrt.f b/lapack-netlib/SRC/dgetsqrhrt.f index 668deeba8..682c7c30f 100644 --- a/lapack-netlib/SRC/dgetsqrhrt.f +++ b/lapack-netlib/SRC/dgetsqrhrt.f @@ -130,14 +130,17 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The dimension of the array WORK. -*> LWORK >= MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ), +*> If MIN(M,N) = 0, LWORK >= 1, else +*> LWORK >= MAX( 1, LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ), *> where *> NUM_ALL_ROW_BLOCKS = CEIL((M-N)/(MB1-N)), *> NB1LOCAL = MIN(NB1,N). *> LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL, *> LW1 = NB1LOCAL * N, -*> LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ), +*> LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ). +*> *> If LWORK = -1, then a workspace query is assumed. *> The routine only calculates the optimal size of the WORK *> array, returns this value as the first entry of the WORK @@ -160,7 +163,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleOTHERcomputational +*> \ingroup getsqrhrt * *> \par Contributors: * ================== @@ -212,7 +215,7 @@ * Test the input arguments * INFO = 0 - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 .OR. M.LT.N ) THEN @@ -225,7 +228,7 @@ INFO = -5 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -7 - ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN + ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN INFO = -9 ELSE * @@ -263,8 +266,9 @@ LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ) * LWORKOPT = MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ) + LWORKOPT = MAX( 1, LWORKOPT ) * - IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN + IF( LWORK.LT.LWORKOPT .AND. .NOT.LQUERY ) THEN INFO = -11 END IF * @@ -346,4 +350,4 @@ * * End of DGETSQRHRT * - END \ No newline at end of file + END diff --git a/lapack-netlib/SRC/dgges.f b/lapack-netlib/SRC/dgges.f index 31db23715..b9ffc7982 100644 --- a/lapack-netlib/SRC/dgges.f +++ b/lapack-netlib/SRC/dgges.f @@ -234,8 +234,8 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*> If N = 0, LWORK >= 1, else LWORK >= 8*N+16. -*> For good performance , LWORK must generally be larger. +*> If N = 0, LWORK >= 1, else LWORK >= MAX(8*N,6*N+16). +*> For good performance, LWORK must generally be larger. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -275,7 +275,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEeigen +*> \ingroup gges * * ===================================================================== SUBROUTINE DGGES( JOBVSL, JOBVSR, SORT, SELCTG, N, A, LDA, B, LDB, @@ -321,9 +321,8 @@ DOUBLE PRECISION DIF( 2 ) * .. * .. External Subroutines .. - EXTERNAL DGEQRF, DGGBAK, DGGBAL, DGGHRD, DHGEQZ, DLABAD, - $ DLACPY, DLASCL, DLASET, DORGQR, DORMQR, DTGSEN, - $ XERBLA + EXTERNAL DGEQRF, DGGBAK, DGGBAL, DGGHRD, DHGEQZ, DLACPY, + $ DLASCL, DLASET, DORGQR, DORMQR, DTGSEN, XERBLA * .. * .. External Functions .. 
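It may be worth noting that the relaxed DGGES bound above (LWORK >= MAX(8*N,6*N+16) in place of 8*N+16) is never larger than the old one: for every N >= 1, 8*N+16 >= 8*N and 8*N+16 >= 6*N+16, hence 8*N+16 >= MAX(8*N, 6*N+16). Callers that sized WORK for the previously documented minimum therefore still satisfy the new requirement; only newly written callers can exploit the smaller bound.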
LOGICAL LSAME @@ -431,7 +430,6 @@ EPS = DLAMCH( 'P' ) SAFMIN = DLAMCH( 'S' ) SAFMAX = ONE / SAFMIN - CALL DLABAD( SAFMIN, SAFMAX ) SMLNUM = SQRT( SAFMIN ) / EPS BIGNUM = ONE / SMLNUM * diff --git a/lapack-netlib/SRC/dgges3.f b/lapack-netlib/SRC/dgges3.f index 7b00d294a..2ef55951a 100644 --- a/lapack-netlib/SRC/dgges3.f +++ b/lapack-netlib/SRC/dgges3.f @@ -234,6 +234,8 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. +*> If N = 0, LWORK >= 1, else LWORK >= 6*N+16. +*> For good performance, LWORK must generally be larger. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -273,7 +275,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEeigen +*> \ingroup gges3 * * ===================================================================== SUBROUTINE DGGES3( JOBVSL, JOBVSR, SORT, SELCTG, N, A, LDA, B, @@ -309,7 +311,8 @@ LOGICAL CURSL, ILASCL, ILBSCL, ILVSL, ILVSR, LASTSL, $ LQUERY, LST2SL, WANTST INTEGER I, ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT, - $ ILO, IP, IRIGHT, IROWS, ITAU, IWRK, LWKOPT + $ ILO, IP, IRIGHT, IROWS, ITAU, IWRK, LWKOPT, + $ LWKMIN DOUBLE PRECISION ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS, PVSL, $ PVSR, SAFMAX, SAFMIN, SMLNUM * .. @@ -318,9 +321,8 @@ DOUBLE PRECISION DIF( 2 ) * .. * .. External Subroutines .. - EXTERNAL DGEQRF, DGGBAK, DGGBAL, DGGHD3, DLAQZ0, DLABAD, - $ DLACPY, DLASCL, DLASET, DORGQR, DORMQR, DTGSEN, - $ XERBLA + EXTERNAL DGEQRF, DGGBAK, DGGBAL, DGGHD3, DLAQZ0, DLACPY, + $ DLASCL, DLASET, DORGQR, DORMQR, DTGSEN, XERBLA * .. * .. External Functions .. LOGICAL LSAME @@ -362,6 +364,12 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + IF( N.EQ.0 ) THEN + LWKMIN = 1 + ELSE + LWKMIN = 6*N+16 + END IF +* IF( IJOBVL.LE.0 ) THEN INFO = -1 ELSE IF( IJOBVR.LE.0 ) THEN @@ -378,7 +386,7 @@ INFO = -15 ELSE IF( LDVSR.LT.1 .OR. ( ILVSR .AND. LDVSR.LT.N ) ) THEN INFO = -17 - ELSE IF( LWORK.LT.6*N+16 .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. 
.NOT.LQUERY ) THEN INFO = -19 END IF * @@ -386,29 +394,33 @@ * IF( INFO.EQ.0 ) THEN CALL DGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR ) - LWKOPT = MAX( 6*N+16, 3*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKMIN, 3*N+INT( WORK( 1 ) ) ) CALL DORMQR( 'L', 'T', N, N, N, B, LDB, WORK, A, LDA, WORK, $ -1, IERR ) - LWKOPT = MAX( LWKOPT, 3*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) ) IF( ILVSL ) THEN CALL DORGQR( N, N, N, VSL, LDVSL, WORK, WORK, -1, IERR ) - LWKOPT = MAX( LWKOPT, 3*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) ) END IF CALL DGGHD3( JOBVSL, JOBVSR, N, 1, N, A, LDA, B, LDB, VSL, $ LDVSL, VSR, LDVSR, WORK, -1, IERR ) - LWKOPT = MAX( LWKOPT, 3*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) ) CALL DLAQZ0( 'S', JOBVSL, JOBVSR, N, 1, N, A, LDA, B, LDB, $ ALPHAR, ALPHAI, BETA, VSL, LDVSL, VSR, LDVSR, $ WORK, -1, 0, IERR ) - LWKOPT = MAX( LWKOPT, 2*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 2*N+INT( WORK( 1 ) ) ) IF( WANTST ) THEN CALL DTGSEN( 0, ILVSL, ILVSR, BWORK, N, A, LDA, B, LDB, $ ALPHAR, ALPHAI, BETA, VSL, LDVSL, VSR, LDVSR, $ SDIM, PVSL, PVSR, DIF, WORK, -1, IDUM, 1, $ IERR ) - LWKOPT = MAX( LWKOPT, 2*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 2*N+INT( WORK( 1 ) ) ) + END IF + IF( N.EQ.0 ) THEN + WORK( 1 ) = 1 + ELSE + WORK( 1 ) = LWKOPT END IF - WORK( 1 ) = LWKOPT END IF * IF( INFO.NE.0 ) THEN @@ -430,7 +442,6 @@ EPS = DLAMCH( 'P' ) SAFMIN = DLAMCH( 'S' ) SAFMAX = ONE / SAFMIN - CALL DLABAD( SAFMIN, SAFMAX ) SMLNUM = SQRT( SAFMIN ) / EPS BIGNUM = ONE / SMLNUM * diff --git a/lapack-netlib/SRC/dggev3.f b/lapack-netlib/SRC/dggev3.f index 4bbe8a40f..b970c04c4 100644 --- a/lapack-netlib/SRC/dggev3.f +++ b/lapack-netlib/SRC/dggev3.f @@ -188,7 +188,9 @@ *> *> \param[in] LWORK *> \verbatim -*> LWORK is INTEGER +*> LWORK is INTEGER. +*> The dimension of the array WORK. LWORK >= MAX(1,8*N). +*> For good performance, LWORK should generally be larger. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -217,7 +219,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEeigen +*> \ingroup ggev3 * * ===================================================================== SUBROUTINE DGGEV3( JOBVL, JOBVR, N, A, LDA, B, LDB, ALPHAR, @@ -248,7 +250,8 @@ LOGICAL ILASCL, ILBSCL, ILV, ILVL, ILVR, LQUERY CHARACTER CHTEMP INTEGER ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT, ILO, - $ IN, IRIGHT, IROWS, ITAU, IWRK, JC, JR, LWKOPT + $ IN, IRIGHT, IROWS, ITAU, IWRK, JC, JR, LWKOPT, + $ LWKMIN DOUBLE PRECISION ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS, $ SMLNUM, TEMP * .. @@ -256,9 +259,8 @@ LOGICAL LDUMMA( 1 ) * .. * .. External Subroutines .. - EXTERNAL DGEQRF, DGGBAK, DGGBAL, DGGHD3, DLAQZ0, DLABAD, - $ DLACPY, DLASCL, DLASET, DORGQR, DORMQR, DTGEVC, - $ XERBLA + EXTERNAL DGEQRF, DGGBAK, DGGBAL, DGGHD3, DLAQZ0, DLACPY, + $ DLASCL, DLASET, DORGQR, DORMQR, DTGEVC, XERBLA * .. * .. External Functions .. LOGICAL LSAME @@ -299,6 +301,7 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 8*N ) IF( IJOBVL.LE.0 ) THEN INFO = -1 ELSE IF( IJOBVR.LE.0 ) THEN @@ -313,7 +316,7 @@ INFO = -12 ELSE IF( LDVR.LT.1 .OR. ( ILVR .AND. LDVR.LT.N ) ) THEN INFO = -14 - ELSE IF( LWORK.LT.MAX( 1, 8*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. 
.NOT.LQUERY ) THEN INFO = -16 END IF * @@ -321,13 +324,13 @@ * IF( INFO.EQ.0 ) THEN CALL DGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR ) - LWKOPT = MAX(1, 8*N, 3*N+INT( WORK( 1 ) ) ) + LWKOPT = MAX( LWKMIN, 3*N+INT( WORK( 1 ) ) ) CALL DORMQR( 'L', 'T', N, N, N, B, LDB, WORK, A, LDA, WORK, -1, $ IERR ) - LWKOPT = MAX( LWKOPT, 3*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) ) IF( ILVL ) THEN CALL DORGQR( N, N, N, VL, LDVL, WORK, WORK, -1, IERR ) - LWKOPT = MAX( LWKOPT, 3*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) ) END IF IF( ILV ) THEN CALL DGGHD3( JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB, VL, @@ -336,18 +339,21 @@ CALL DLAQZ0( 'S', JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB, $ ALPHAR, ALPHAI, BETA, VL, LDVL, VR, LDVR, $ WORK, -1, 0, IERR ) - LWKOPT = MAX( LWKOPT, 2*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 2*N+INT( WORK( 1 ) ) ) ELSE CALL DGGHD3( 'N', 'N', N, 1, N, A, LDA, B, LDB, VL, LDVL, $ VR, LDVR, WORK, -1, IERR ) - LWKOPT = MAX( LWKOPT, 3*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) ) CALL DLAQZ0( 'E', JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB, $ ALPHAR, ALPHAI, BETA, VL, LDVL, VR, LDVR, $ WORK, -1, 0, IERR ) - LWKOPT = MAX( LWKOPT, 2*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 2*N+INT( WORK( 1 ) ) ) + END IF + IF( N.EQ.0 ) THEN + WORK( 1 ) = 1 + ELSE + WORK( 1 ) = LWKOPT END IF - - WORK( 1 ) = LWKOPT END IF * IF( INFO.NE.0 ) THEN @@ -367,7 +373,6 @@ EPS = DLAMCH( 'P' ) SMLNUM = DLAMCH( 'S' ) BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) SMLNUM = SQRT( SMLNUM ) / EPS BIGNUM = ONE / SMLNUM * diff --git a/lapack-netlib/SRC/dgghd3.f b/lapack-netlib/SRC/dgghd3.f index 43d7a77df..21a668573 100644 --- a/lapack-netlib/SRC/dgghd3.f +++ b/lapack-netlib/SRC/dgghd3.f @@ -179,14 +179,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension (LWORK) +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> -*> \param[in] LWORK +*> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of the array WORK. LWORK >= 1. +*> The length of the array WORK. LWORK >= 1. *> For optimum performance LWORK >= 6*N*NB, where NB is the *> optimal blocksize. *> @@ -211,7 +211,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleOTHERcomputational +*> \ingroup gghd3 * *> \par Further Details: * ===================== @@ -275,7 +275,12 @@ * INFO = 0 NB = ILAENV( 1, 'DGGHD3', ' ', N, ILO, IHI, -1 ) - LWKOPT = MAX( 6*N*NB, 1 ) + NH = IHI - ILO + 1 + IF( NH.LE.1 ) THEN + LWKOPT = 1 + ELSE + LWKOPT = 6*N*NB + END IF WORK( 1 ) = DBLE( LWKOPT ) INITQ = LSAME( COMPQ, 'I' ) WANTQ = INITQ .OR. LSAME( COMPQ, 'V' ) @@ -325,7 +330,6 @@ * * Quick return if possible * - NH = IHI - ILO + 1 IF( NH.LE.1 ) THEN WORK( 1 ) = ONE RETURN @@ -885,6 +889,7 @@ IF ( JCOL.LT.IHI ) $ CALL DGGHRD( COMPQ2, COMPZ2, N, JCOL, IHI, A, LDA, B, LDB, Q, $ LDQ, Z, LDZ, IERR ) +* WORK( 1 ) = DBLE( LWKOPT ) * RETURN diff --git a/lapack-netlib/SRC/dggqrf.f b/lapack-netlib/SRC/dggqrf.f index 39d27a5c9..edac7f22f 100644 --- a/lapack-netlib/SRC/dggqrf.f +++ b/lapack-netlib/SRC/dggqrf.f @@ -173,7 +173,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. 
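With the DGGEV3 changes above, a query call now reports an optimal size that already respects the documented minimum (MAX(1,8*N), or 1 when N = 0). A minimal sketch, assuming the matrices and leading dimensions are set up and WQRY(1) is a scratch array for the reported length:

    ! Query call only; the generalized eigenproblem is not solved here.
    CALL DGGEV3( 'V', 'V', N, A, LDA, B, LDB, ALPHAR, ALPHAI, BETA, &
                 VL, LDVL, VR, LDVR, WQRY, -1, INFO )
    LWORK = INT( WQRY(1) )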
* -*> \ingroup doubleOTHERcomputational +*> \ingroup ggqrf * *> \par Further Details: * ===================== @@ -250,7 +250,7 @@ NB2 = ILAENV( 1, 'DGERQF', ' ', N, P, -1, -1 ) NB3 = ILAENV( 1, 'DORMQR', ' ', N, M, P, -1 ) NB = MAX( NB1, NB2, NB3 ) - LWKOPT = MAX( N, M, P )*NB + LWKOPT = MAX( 1, MAX( N, M, P )*NB ) WORK( 1 ) = LWKOPT LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.0 ) THEN @@ -287,6 +287,7 @@ * RQ factorization of N-by-P matrix B: B = T*Z. * CALL DGERQF( N, P, B, LDB, TAUB, WORK, LWORK, INFO ) +* WORK( 1 ) = MAX( LOPT, INT( WORK( 1 ) ) ) * RETURN diff --git a/lapack-netlib/SRC/dggrqf.f b/lapack-netlib/SRC/dggrqf.f index ddf4104c5..3b1024c1c 100644 --- a/lapack-netlib/SRC/dggrqf.f +++ b/lapack-netlib/SRC/dggrqf.f @@ -172,7 +172,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleOTHERcomputational +*> \ingroup ggrqf * *> \par Further Details: * ===================== @@ -249,7 +249,7 @@ NB2 = ILAENV( 1, 'DGEQRF', ' ', P, N, -1, -1 ) NB3 = ILAENV( 1, 'DORMRQ', ' ', M, N, P, -1 ) NB = MAX( NB1, NB2, NB3 ) - LWKOPT = MAX( N, M, P )*NB + LWKOPT = MAX( 1, MAX( N, M, P )*NB ) WORK( 1 ) = LWKOPT LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN diff --git a/lapack-netlib/SRC/dggsvd3.f b/lapack-netlib/SRC/dggsvd3.f index 503f0d8cc..ee4d11e86 100644 --- a/lapack-netlib/SRC/dggsvd3.f +++ b/lapack-netlib/SRC/dggsvd3.f @@ -278,7 +278,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -328,7 +328,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEsing +*> \ingroup ggsvd3 * *> \par Contributors: * ================== diff --git a/lapack-netlib/SRC/dggsvp3.f b/lapack-netlib/SRC/dggsvp3.f index 4e1db3117..485d95b36 100644 --- a/lapack-netlib/SRC/dggsvp3.f +++ b/lapack-netlib/SRC/dggsvp3.f @@ -227,7 +227,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -250,7 +250,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleOTHERcomputational +*> \ingroup ggsvp3 * *> \par Further Details: * ===================== diff --git a/lapack-netlib/SRC/dlamswlq.f b/lapack-netlib/SRC/dlamswlq.f index 70e78f4b1..07ef1bd57 100644 --- a/lapack-netlib/SRC/dlamswlq.f +++ b/lapack-netlib/SRC/dlamswlq.f @@ -127,17 +127,20 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*> If SIDE = 'L', LWORK >= max(1,NB) * MB; -*> if SIDE = 'R', LWORK >= max(1,M) * MB. +*> +*> If MIN(M,N,K) = 0, LWORK >= 1. +*> If SIDE = 'L', LWORK >= max(1,NB*MB). +*> If SIDE = 'R', LWORK >= max(1,M*MB). *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. 
*> \endverbatim @@ -189,29 +192,31 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup lamswlq +*> * ===================================================================== SUBROUTINE DLAMSWLQ( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T, - $ LDT, C, LDC, WORK, LWORK, INFO ) + $ LDT, C, LDC, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * * .. Scalar Arguments .. - CHARACTER SIDE, TRANS - INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC + CHARACTER SIDE, TRANS + INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC * .. * .. Array Arguments .. - DOUBLE PRECISION A( LDA, * ), WORK( * ), C(LDC, * ), - $ T( LDT, * ) + DOUBLE PRECISION A( LDA, * ), WORK( * ), C( LDC, * ), + $ T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER I, II, KK, CTR, LW + LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY + INTEGER I, II, KK, CTR, LW, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME @@ -223,52 +228,60 @@ * * Test the input arguments * - LQUERY = LWORK.LT.0 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'T' ) LEFT = LSAME( SIDE, 'L' ) RIGHT = LSAME( SIDE, 'R' ) - IF (LEFT) THEN + IF( LEFT ) THEN LW = N * MB ELSE LW = M * MB END IF +* + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF * INFO = 0 IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN - INFO = -1 + INFO = -1 ELSE IF( .NOT.TRAN .AND. .NOT.NOTRAN ) THEN - INFO = -2 + INFO = -2 ELSE IF( K.LT.0 ) THEN INFO = -5 ELSE IF( M.LT.K ) THEN INFO = -3 ELSE IF( N.LT.0 ) THEN INFO = -4 - ELSE IF( K.LT.MB .OR. MB.LT.1) THEN + ELSE IF( K.LT.MB .OR. MB.LT.1 ) THEN INFO = -6 ELSE IF( LDA.LT.MAX( 1, K ) ) THEN INFO = -9 - ELSE IF( LDT.LT.MAX( 1, MB) ) THEN + ELSE IF( LDT.LT.MAX( 1, MB ) ) THEN INFO = -11 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN - INFO = -13 - ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN + INFO = -13 + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -15 END IF * + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = LWMIN + END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'DLAMSWLQ', -INFO ) - WORK(1) = LW RETURN - ELSE IF (LQUERY) THEN - WORK(1) = LW + ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * - IF( MIN(M,N,K).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * @@ -402,7 +415,8 @@ * END IF * - WORK(1) = LW + WORK( 1 ) = LWMIN +* RETURN * * End of DLAMSWLQ diff --git a/lapack-netlib/SRC/dlamtsqr.f b/lapack-netlib/SRC/dlamtsqr.f index 962a31476..023db5ac9 100644 --- a/lapack-netlib/SRC/dlamtsqr.f +++ b/lapack-netlib/SRC/dlamtsqr.f @@ -128,22 +128,24 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) -*> +*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim +*> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. +*> If MIN(M,N,K) = 0, LWORK >= 1. +*> If SIDE = 'L', LWORK >= max(1,N*NB). +*> If SIDE = 'R', LWORK >= max(1,MB*NB). *> -*> If SIDE = 'L', LWORK >= max(1,N)*NB; -*> if SIDE = 'R', LWORK >= max(1,MB)*NB. 
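The DLAMSWLQ hunks above (and the DLAMTSQR ones that follow) reword the workspace documentation from "optimal" to "minimal", tighten LQUERY to LWORK = -1, and let an empty problem (MIN(M,N,K) = 0) get by with LWORK >= 1. A free-form sketch of what the LWORK = -1 query reports for DLAMTSQR under these rules; the dimensions (a 10-by-3 tall-skinny factorization applied from the left to a 10-by-2 C, block sizes MB = 7 and NB = 3) are invented for illustration, and the array contents never matter for the query itself.

program lamtsqr_query
   ! Sketch: query the minimal workspace of DLAMTSQR for a non-empty
   ! and for an empty problem; only WORK(1) is of interest here.
   implicit none
   integer, parameter :: m = 10, k = 3, nrhs = 2, mb = 7, nb = 3
   double precision :: a(m,k)    ! factored matrix from DLATSQR (unused by the query)
   double precision :: t(nb,k)   ! block reflector factors from DLATSQR
   double precision :: c(m,nrhs) ! matrix the Q factor would be applied to
   double precision :: wq(1)
   integer :: info
   ! Non-empty problem: for SIDE = 'L' the minimal LWORK is N*NB.
   call dlamtsqr('L', 'T', m, nrhs, k, mb, nb, a, m, t, nb, c, m, &
                 wq, -1, info)
   print *, 'minimal LWORK (N =', nrhs, '):', int(wq(1))
   ! Empty problem (N = 0): the minimal LWORK is now 1.
   call dlamtsqr('L', 'T', m, 0, k, mb, nb, a, m, t, nb, c, m, &
                 wq, -1, info)
   print *, 'minimal LWORK (N = 0):', int(wq(1))
end program lamtsqr_query

Under the new LWMIN logic the first query should report N*NB = 6, while the empty case reports 1 rather than 0.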
*> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. -*> *> \endverbatim +*> *> \param[out] INFO *> \verbatim *> INFO is INTEGER @@ -191,29 +193,31 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup lamtsqr +*> * ===================================================================== SUBROUTINE DLAMTSQR( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T, - $ LDT, C, LDC, WORK, LWORK, INFO ) + $ LDT, C, LDC, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * * .. Scalar Arguments .. - CHARACTER SIDE, TRANS - INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC + CHARACTER SIDE, TRANS + INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC * .. * .. Array Arguments .. - DOUBLE PRECISION A( LDA, * ), WORK( * ), C(LDC, * ), - $ T( LDT, * ) + DOUBLE PRECISION A( LDA, * ), WORK( * ), C( LDC, * ), + $ T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER I, II, KK, LW, CTR, Q + LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY + INTEGER I, II, KK, LW, CTR, Q, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME @@ -225,12 +229,13 @@ * * Test the input arguments * - LQUERY = LWORK.LT.0 + INFO = 0 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'T' ) LEFT = LSAME( SIDE, 'L' ) RIGHT = LSAME( SIDE, 'R' ) - IF (LEFT) THEN + IF( LEFT ) THEN LW = N * NB Q = M ELSE @@ -238,11 +243,17 @@ Q = N END IF * - INFO = 0 + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF +* IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN - INFO = -1 + INFO = -1 ELSE IF( .NOT.TRAN .AND. .NOT.NOTRAN ) THEN - INFO = -2 + INFO = -2 ELSE IF( M.LT.K ) THEN INFO = -3 ELSE IF( N.LT.0 ) THEN @@ -253,38 +264,38 @@ INFO = -7 ELSE IF( LDA.LT.MAX( 1, Q ) ) THEN INFO = -9 - ELSE IF( LDT.LT.MAX( 1, NB) ) THEN + ELSE IF( LDT.LT.MAX( 1, NB ) ) THEN INFO = -11 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN - INFO = -13 - ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN + INFO = -13 + ELSE IF( LWORK.LT.LWMIN .AND. 
(.NOT.LQUERY) ) THEN INFO = -15 END IF * -* Determine the block size if it is tall skinny or short and wide -* - IF( INFO.EQ.0) THEN - WORK(1) = LW + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = LWMIN END IF * IF( INFO.NE.0 ) THEN CALL XERBLA( 'DLAMTSQR', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N,K).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF +* +* Determine the block size if it is tall skinny or short and wide * IF((MB.LE.K).OR.(MB.GE.MAX(M,N,K))) THEN CALL DGEMQRT( SIDE, TRANS, M, N, K, NB, A, LDA, - $ T, LDT, C, LDC, WORK, INFO) + $ T, LDT, C, LDC, WORK, INFO ) RETURN - END IF + END IF * IF(LEFT.AND.NOTRAN) THEN * @@ -410,7 +421,8 @@ * END IF * - WORK(1) = LW + WORK( 1 ) = LWMIN +* RETURN * * End of DLAMTSQR diff --git a/lapack-netlib/SRC/dlaswlq.f b/lapack-netlib/SRC/dlaswlq.f index c95c94cbc..636c12dc8 100644 --- a/lapack-netlib/SRC/dlaswlq.f +++ b/lapack-netlib/SRC/dlaswlq.f @@ -99,19 +99,22 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) -*> +*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim +*> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= MB*M. +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MB*M, otherwise. +*> *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. -*> *> \endverbatim +*> *> \param[out] INFO *> \verbatim *> INFO is INTEGER @@ -159,33 +162,37 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup laswlq +*> * ===================================================================== SUBROUTINE DLASWLQ( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK, - $ INFO) + $ INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- * * .. Scalar Arguments .. - INTEGER INFO, LDA, M, N, MB, NB, LWORK, LDT + INTEGER INFO, LDA, M, N, MB, NB, LWORK, LDT * .. * .. Array Arguments .. - DOUBLE PRECISION A( LDA, * ), WORK( * ), T( LDT, *) + DOUBLE PRECISION A( LDA, * ), WORK( * ), T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LQUERY - INTEGER I, II, KK, CTR + LOGICAL LQUERY + INTEGER I, II, KK, CTR, MINMN, LWMIN * .. * .. EXTERNAL FUNCTIONS .. LOGICAL LSAME EXTERNAL LSAME +* .. * .. EXTERNAL SUBROUTINES .. EXTERNAL DGELQT, DTPLQT, XERBLA +* .. * .. INTRINSIC FUNCTIONS .. INTRINSIC MAX, MIN, MOD * .. @@ -196,12 +203,19 @@ INFO = 0 * LQUERY = ( LWORK.EQ.-1 ) +* + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = M*MB + END IF * IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 .OR. N.LT.M ) THEN INFO = -2 - ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 )) THEN + ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 ) ) THEN INFO = -3 ELSE IF( NB.LT.0 ) THEN INFO = -4 @@ -209,60 +223,62 @@ INFO = -6 ELSE IF( LDT.LT.MB ) THEN INFO = -8 - ELSE IF( ( LWORK.LT.M*MB) .AND. (.NOT.LQUERY) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. 
(.NOT.LQUERY) ) THEN INFO = -10 END IF - IF( INFO.EQ.0) THEN - WORK(1) = MB*M +* + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = LWMIN END IF * IF( INFO.NE.0 ) THEN CALL XERBLA( 'DLASWLQ', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N).EQ.0 ) THEN - RETURN + IF( MINMN.EQ.0 ) THEN + RETURN END IF * * The LQ Decomposition * - IF((M.GE.N).OR.(NB.LE.M).OR.(NB.GE.N)) THEN - CALL DGELQT( M, N, MB, A, LDA, T, LDT, WORK, INFO) + IF( (M.GE.N) .OR. (NB.LE.M) .OR. (NB.GE.N) ) THEN + CALL DGELQT( M, N, MB, A, LDA, T, LDT, WORK, INFO ) RETURN - END IF + END IF * - KK = MOD((N-M),(NB-M)) - II=N-KK+1 + KK = MOD((N-M),(NB-M)) + II = N-KK+1 * -* Compute the LQ factorization of the first block A(1:M,1:NB) +* Compute the LQ factorization of the first block A(1:M,1:NB) * - CALL DGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO) - CTR = 1 + CALL DGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO ) + CTR = 1 * - DO I = NB+1, II-NB+M , (NB-M) + DO I = NB+1, II-NB+M, (NB-M) * -* Compute the QR factorization of the current block A(1:M,I:I+NB-M) +* Compute the QR factorization of the current block A(1:M,I:I+NB-M) * - CALL DTPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ), - $ LDA, T(1, CTR * M + 1), - $ LDT, WORK, INFO ) - CTR = CTR + 1 - END DO + CALL DTPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ), + $ LDA, T(1, CTR * M + 1), + $ LDT, WORK, INFO ) + CTR = CTR + 1 + END DO * * Compute the QR factorization of the last block A(1:M,II:N) * - IF (II.LE.N) THEN + IF( II.LE.N ) THEN CALL DTPLQT( M, KK, 0, MB, A(1,1), LDA, A( 1, II ), - $ LDA, T(1, CTR * M + 1), LDT, - $ WORK, INFO ) - END IF + $ LDA, T(1, CTR * M + 1), LDT, + $ WORK, INFO ) + END IF +* + WORK( 1 ) = LWMIN * - WORK( 1 ) = M * MB RETURN * * End of DLASWLQ diff --git a/lapack-netlib/SRC/dlatrs3.f b/lapack-netlib/SRC/dlatrs3.f index e6d78b672..d18675b2d 100644 --- a/lapack-netlib/SRC/dlatrs3.f +++ b/lapack-netlib/SRC/dlatrs3.f @@ -151,13 +151,17 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension (LWORK). +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)). *> On exit, if INFO = 0, WORK(1) returns the optimal size of *> WORK. *> \endverbatim *> *> \param[in] LWORK +*> \verbatim *> LWORK is INTEGER +*> The dimension of the array WORK. +*> +*> If MIN(N,NRHS) = 0, LWORK >= 1, else *> LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where *> NBA = (N + NB - 1)/NB and NB is the optimal block size. *> @@ -165,6 +169,7 @@ *> only calculates the optimal dimensions of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. +*> \endverbatim *> *> \param[out] INFO *> \verbatim @@ -181,7 +186,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleOTHERauxiliary +*> \ingroup latrs3 *> \par Further Details: * ===================== * \verbatim @@ -253,7 +258,7 @@ LOGICAL LQUERY, NOTRAN, NOUNIT, UPPER INTEGER AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J, $ JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2, - $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS + $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS, LWMIN DOUBLE PRECISION ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC, $ SCAMIN, SMLNUM, TMAX * .. @@ -292,15 +297,24 @@ * row. WORK( I+KK*LDS ) is the scale factor of the vector * segment associated with the I-th block row and the KK-th vector * in the block column. 
+* LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) ) LDS = NBA +* * The second part stores upper bounds of the triangular A. There are * a total of NBA x NBA blocks, of which only the upper triangular * part or the lower triangular part is referenced. The upper bound of * the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). +* LANRM = NBA * NBA AWRK = LSCALE - WORK( 1 ) = LSCALE + LANRM +* + IF( MIN( N, NRHS ).EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = LSCALE + LANRM + END IF + WORK( 1 ) = LWMIN * * Test the input parameters * @@ -322,7 +336,7 @@ INFO = -8 ELSE IF( LDX.LT.MAX( 1, N ) ) THEN INFO = -10 - ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN + ELSE IF( .NOT.LQUERY .AND. LWORK.LT.LWMIN ) THEN INFO = -14 END IF IF( INFO.NE.0 ) THEN @@ -649,6 +663,9 @@ END IF END DO END DO +* + WORK( 1 ) = LWMIN +* RETURN * * End of DLATRS3 diff --git a/lapack-netlib/SRC/dlatsqr.f b/lapack-netlib/SRC/dlatsqr.f index 94a04be02..0000aab68 100644 --- a/lapack-netlib/SRC/dlatsqr.f +++ b/lapack-netlib/SRC/dlatsqr.f @@ -101,15 +101,18 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= NB*N. +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= NB*N, otherwise. +*> *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> \endverbatim @@ -161,27 +164,29 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup latsqr +*> * ===================================================================== SUBROUTINE DLATSQR( M, N, MB, NB, A, LDA, T, LDT, WORK, - $ LWORK, INFO) + $ LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- * * .. Scalar Arguments .. - INTEGER INFO, LDA, M, N, MB, NB, LDT, LWORK + INTEGER INFO, LDA, M, N, MB, NB, LDT, LWORK * .. * .. Array Arguments .. - DOUBLE PRECISION A( LDA, * ), WORK( * ), T(LDT, *) + DOUBLE PRECISION A( LDA, * ), WORK( * ), T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LQUERY - INTEGER I, II, KK, CTR + LOGICAL LQUERY + INTEGER I, II, KK, CTR, MINMN, LWMIN * .. * .. EXTERNAL FUNCTIONS .. LOGICAL LSAME @@ -198,6 +203,13 @@ INFO = 0 * LQUERY = ( LWORK.EQ.-1 ) +* + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = N*NB + END IF * IF( M.LT.0 ) THEN INFO = -1 @@ -205,65 +217,67 @@ INFO = -2 ELSE IF( MB.LT.1 ) THEN INFO = -3 - ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. N.GT.0 )) THEN + ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. N.GT.0 ) ) THEN INFO = -4 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -6 ELSE IF( LDT.LT.NB ) THEN INFO = -8 - ELSE IF( LWORK.LT.(N*NB) .AND. (.NOT.LQUERY) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. 
(.NOT.LQUERY) ) THEN INFO = -10 END IF - IF( INFO.EQ.0) THEN - WORK(1) = NB*N +* + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = LWMIN END IF +* IF( INFO.NE.0 ) THEN CALL XERBLA( 'DLATSQR', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N).EQ.0 ) THEN - RETURN + IF( MINMN.EQ.0 ) THEN + RETURN END IF * * The QR Decomposition * - IF ((MB.LE.N).OR.(MB.GE.M)) THEN - CALL DGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO) - RETURN - END IF + IF( (MB.LE.N) .OR. (MB.GE.M) ) THEN + CALL DGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO ) + RETURN + END IF * - KK = MOD((M-N),(MB-N)) - II=M-KK+1 + KK = MOD((M-N),(MB-N)) + II = M-KK+1 * -* Compute the QR factorization of the first block A(1:MB,1:N) +* Compute the QR factorization of the first block A(1:MB,1:N) * - CALL DGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO ) + CALL DGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO ) * - CTR = 1 - DO I = MB+1, II-MB+N , (MB-N) + CTR = 1 + DO I = MB+1, II-MB+N, (MB-N) * -* Compute the QR factorization of the current block A(I:I+MB-N,1:N) +* Compute the QR factorization of the current block A(I:I+MB-N,1:N) * - CALL DTPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA, - $ T(1, CTR * N + 1), - $ LDT, WORK, INFO ) - CTR = CTR + 1 - END DO + CALL DTPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA, + $ T(1, CTR * N + 1), + $ LDT, WORK, INFO ) + CTR = CTR + 1 + END DO * -* Compute the QR factorization of the last block A(II:M,1:N) +* Compute the QR factorization of the last block A(II:M,1:N) * - IF (II.LE.M) THEN - CALL DTPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA, - $ T(1, CTR * N + 1), LDT, - $ WORK, INFO ) - END IF + IF( II.LE.M ) THEN + CALL DTPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA, + $ T(1, CTR * N + 1), LDT, + $ WORK, INFO ) + END IF * - WORK( 1 ) = N*NB + WORK( 1 ) = LWMIN RETURN * * End of DLATSQR diff --git a/lapack-netlib/SRC/dsyev_2stage.f b/lapack-netlib/SRC/dsyev_2stage.f index 50d51d992..286366bfe 100644 --- a/lapack-netlib/SRC/dsyev_2stage.f +++ b/lapack-netlib/SRC/dsyev_2stage.f @@ -20,7 +20,7 @@ * Definition: * =========== * -* SUBROUTINE DSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, +* SUBROUTINE DSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, * INFO ) * * IMPLICIT NONE @@ -97,7 +97,7 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension LWORK +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> @@ -105,12 +105,12 @@ *> \verbatim *> LWORK is INTEGER *> The length of the array WORK. LWORK >= 1, when N <= 1; -*> otherwise +*> otherwise *> If JOBZ = 'N' and N > 1, LWORK must be queried. *> LWORK = MAX(1, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N + 2*N -*> = N*KD + N*max(KD+1,FACTOPTNB) -*> + max(2*KD*KD, KD*NTHREADS) +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) *> + (KD+1)*N + 2*N *> where KD is the blocking size of the reduction, *> FACTOPTNB is the blocking used by the QR or LQ @@ -143,7 +143,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYeigen +*> \ingroup heev_2stage * *> \par Further Details: * ===================== @@ -161,7 +161,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. 
-*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -169,16 +169,16 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim * * ===================================================================== - SUBROUTINE DSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, + SUBROUTINE DSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, $ INFO ) * IMPLICIT NONE @@ -305,7 +305,7 @@ LLWORK = LWORK - INDWRK + 1 * CALL DSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK( INDE ), - $ WORK( INDTAU ), WORK( INDHOUS ), LHTRD, + $ WORK( INDTAU ), WORK( INDHOUS ), LHTRD, $ WORK( INDWRK ), LLWORK, IINFO ) * * For eigenvalues only, call DSTERF. For eigenvectors, first call diff --git a/lapack-netlib/SRC/dsyevd.f b/lapack-netlib/SRC/dsyevd.f index b27f4cdc7..adcfcb373 100644 --- a/lapack-netlib/SRC/dsyevd.f +++ b/lapack-netlib/SRC/dsyevd.f @@ -96,8 +96,7 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, -*> dimension (LWORK) +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> @@ -160,7 +159,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYeigen +*> \ingroup heevd * *> \par Contributors: * ================== diff --git a/lapack-netlib/SRC/dsyevr.f b/lapack-netlib/SRC/dsyevr.f index 698691533..8647b0162 100644 --- a/lapack-netlib/SRC/dsyevr.f +++ b/lapack-netlib/SRC/dsyevr.f @@ -271,7 +271,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,26*N). +*> The dimension of the array WORK. +*> If N <= 1, LWORK >= 1, else LWORK >= 26*N. *> For optimal efficiency, LWORK >= (NB+6)*N, *> where NB is the max of the blocksize for DSYTRD and DORMTR *> returned by ILAENV. @@ -285,13 +286,14 @@ *> \param[out] IWORK *> \verbatim *> IWORK is INTEGER array, dimension (MAX(1,LIWORK)) -*> On exit, if INFO = 0, IWORK(1) returns the optimal LWORK. +*> On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. *> \endverbatim *> *> \param[in] LIWORK *> \verbatim *> LIWORK is INTEGER -*> The dimension of the array IWORK. LIWORK >= max(1,10*N). +*> The dimension of the array IWORK. +*> If N <= 1, LIWORK >= 1, else LIWORK >= 10*N. *> *> If LIWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the IWORK array, @@ -315,7 +317,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYeigen +*> \ingroup heevr * *> \par Contributors: * ================== @@ -390,8 +392,13 @@ * LQUERY = ( ( LWORK.EQ.-1 ) .OR. ( LIWORK.EQ.-1 ) ) * - LWMIN = MAX( 1, 26*N ) - LIWMIN = MAX( 1, 10*N ) + IF( N.LE.1 ) THEN + LWMIN = 1 + LIWMIN = 1 + ELSE + LWMIN = 26*N + LIWMIN = 10*N + END IF * INFO = 0 IF( .NOT.( WANTZ .OR. 
LSAME( JOBZ, 'N' ) ) ) THEN @@ -450,7 +457,7 @@ END IF * IF( N.EQ.1 ) THEN - WORK( 1 ) = 7 + WORK( 1 ) = 1 IF( ALLEIG .OR. INDEIG ) THEN M = 1 W( 1 ) = A( 1, 1 ) diff --git a/lapack-netlib/SRC/dsyevr_2stage.f b/lapack-netlib/SRC/dsyevr_2stage.f index 09242bbd3..63d5e3159 100644 --- a/lapack-netlib/SRC/dsyevr_2stage.f +++ b/lapack-netlib/SRC/dsyevr_2stage.f @@ -263,7 +263,7 @@ *> indicating the nonzero elements in Z. The i-th eigenvector *> is nonzero only in elements ISUPPZ( 2*i-1 ) through *> ISUPPZ( 2*i ). This is an output of DSTEMR (tridiagonal -*> matrix). The support of the eigenvectors of A is typically +*> matrix). The support of the eigenvectors of A is typically *> 1:N because of the orthogonal transformations applied by DORMTR. *> Implemented only for RANGE = 'A' or 'I' and IU - IL = N - 1 *> \endverbatim @@ -277,12 +277,13 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. +*> If N <= 1, LWORK must be at least 1. *> If JOBZ = 'N' and N > 1, LWORK must be queried. *> LWORK = MAX(1, 26*N, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N + 5*N -*> = N*KD + N*max(KD+1,FACTOPTNB) -*> + max(2*KD*KD, KD*NTHREADS) +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) *> + (KD+1)*N + 5*N *> where KD is the blocking size of the reduction, *> FACTOPTNB is the blocking used by the QR or LQ @@ -300,13 +301,14 @@ *> \param[out] IWORK *> \verbatim *> IWORK is INTEGER array, dimension (MAX(1,LIWORK)) -*> On exit, if INFO = 0, IWORK(1) returns the optimal LWORK. +*> On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. *> \endverbatim *> *> \param[in] LIWORK *> \verbatim *> LIWORK is INTEGER -*> The dimension of the array IWORK. LIWORK >= max(1,10*N). +*> The dimension of the array IWORK. +*> If N <= 1, LIWORK >= 1, else LIWORK >= 10*N. *> *> If LIWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the IWORK array, @@ -330,7 +332,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYeigen +*> \ingroup heevr_2stage * *> \par Contributors: * ================== @@ -358,7 +360,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -366,11 +368,11 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. 
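The DSYEVR hunk above (and the DSYEVR_2STAGE hunk in progress here) make the N <= 1 minima equal to 1, correct the LIWORK wording, and return WORK(1) = 1 instead of 7 in the N = 1 quick return; the calling sequence itself is untouched. A minimal free-form sketch of the usual double query (LWORK = -1 and LIWORK = -1) for DSYEVR, on an illustrative 4-by-4 matrix:

program syevr_query
   ! Sketch: query both LWORK and LIWORK of DSYEVR, allocate, then
   ! compute all eigenvalues and eigenvectors of a symmetric matrix.
   implicit none
   integer, parameter :: n = 4
   double precision :: a(n,n), w(n), z(n,n), wq(1), vl, vu, abstol
   integer :: isuppz(2*n), iq(1), il, iu, m, info, lwork, liwork, i, j
   double precision, allocatable :: work(:)
   integer, allocatable :: iwork(:)
   ! Illustrative symmetric test matrix.
   do j = 1, n
      do i = 1, n
         a(i,j) = 1.0d0 / dble(i+j-1)
      end do
   end do
   vl = 0.0d0; vu = 0.0d0; il = 1; iu = n; abstol = 0.0d0
   ! Workspace query: LWORK = LIWORK = -1.
   call dsyevr('V', 'A', 'U', n, a, n, vl, vu, il, iu, abstol, m, w, &
               z, n, isuppz, wq, -1, iq, -1, info)
   lwork = int(wq(1)); liwork = iq(1)
   allocate(work(lwork), iwork(liwork))
   ! Actual eigendecomposition.
   call dsyevr('V', 'A', 'U', n, a, n, vl, vu, il, iu, abstol, m, w, &
               z, n, isuppz, work, lwork, iwork, liwork, info)
   print *, 'info =', info, ' eigenvalues:', w(1:m)
end program syevr_query

DSYEVR_2STAGE follows the same two-call pattern, except that only JOBZ = 'N' is available and LWORK should always be obtained by query.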
-*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim * @@ -444,8 +446,14 @@ IB = ILAENV2STAGE( 2, 'DSYTRD_2STAGE', JOBZ, N, KD, -1, -1 ) LHTRD = ILAENV2STAGE( 3, 'DSYTRD_2STAGE', JOBZ, N, KD, IB, -1 ) LWTRD = ILAENV2STAGE( 4, 'DSYTRD_2STAGE', JOBZ, N, KD, IB, -1 ) - LWMIN = MAX( 26*N, 5*N + LHTRD + LWTRD ) - LIWMIN = MAX( 1, 10*N ) +* + IF( N.LE.1 ) THEN + LWMIN = 1 + LIWMIN = 1 + ELSE + LWMIN = MAX( 26*N, 5*N + LHTRD + LWTRD ) + LIWMIN = 10*N + END IF * INFO = 0 IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN @@ -484,7 +492,7 @@ * NB = ILAENV( 1, 'DSYTRD', UPLO, N, -1, -1, -1 ) * NB = MAX( NB, ILAENV( 1, 'DORMTR', UPLO, N, -1, -1, -1 ) ) * LWKOPT = MAX( ( NB+1 )*N, LWMIN ) - WORK( 1 ) = LWMIN + WORK( 1 ) = LWMIN IWORK( 1 ) = LIWMIN END IF * @@ -504,7 +512,7 @@ END IF * IF( N.EQ.1 ) THEN - WORK( 1 ) = 7 + WORK( 1 ) = 1 IF( ALLEIG .OR. INDEIG ) THEN M = 1 W( 1 ) = A( 1, 1 ) @@ -608,7 +616,7 @@ * Call DSYTRD_2STAGE to reduce symmetric matrix to tridiagonal form. * * - CALL DSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, WORK( INDD ), + CALL DSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, WORK( INDD ), $ WORK( INDE ), WORK( INDTAU ), WORK( INDHOUS ), $ LHTRD, WORK( INDWK ), LLWORK, IINFO ) * @@ -727,7 +735,7 @@ * * Set WORK(1) to optimal workspace size. * - WORK( 1 ) = LWMIN + WORK( 1 ) = LWMIN IWORK( 1 ) = LIWMIN * RETURN diff --git a/lapack-netlib/SRC/dsyevx.f b/lapack-netlib/SRC/dsyevx.f index 99719874b..fd6a78e32 100644 --- a/lapack-netlib/SRC/dsyevx.f +++ b/lapack-netlib/SRC/dsyevx.f @@ -244,7 +244,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYeigen +*> \ingroup heevx * * ===================================================================== SUBROUTINE DSYEVX( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU, IL, IU, @@ -338,14 +338,14 @@ IF( INFO.EQ.0 ) THEN IF( N.LE.1 ) THEN LWKMIN = 1 - WORK( 1 ) = LWKMIN + LWKOPT = 1 ELSE LWKMIN = 8*N NB = ILAENV( 1, 'DSYTRD', UPLO, N, -1, -1, -1 ) NB = MAX( NB, ILAENV( 1, 'DORMTR', UPLO, N, -1, -1, -1 ) ) LWKOPT = MAX( LWKMIN, ( NB + 3 )*N ) - WORK( 1 ) = LWKOPT END IF + WORK( 1 ) = LWKOPT * IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) $ INFO = -17 diff --git a/lapack-netlib/SRC/dsysv_aa.f b/lapack-netlib/SRC/dsysv_aa.f index 8dab5a384..0a96ecd7e 100644 --- a/lapack-netlib/SRC/dsysv_aa.f +++ b/lapack-netlib/SRC/dsysv_aa.f @@ -154,7 +154,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYsolve +*> \ingroup hesv_aa * * ===================================================================== SUBROUTINE DSYSV_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, @@ -177,7 +177,7 @@ * * .. Local Scalars .. LOGICAL LQUERY - INTEGER LWKOPT, LWKOPT_SYTRF, LWKOPT_SYTRS + INTEGER LWKMIN, LWKOPT, LWKOPT_SYTRF, LWKOPT_SYTRS * .. * .. External Functions .. LOGICAL LSAME @@ -196,6 +196,7 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 2*N, 3*N-2 ) IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -206,17 +207,17 @@ INFO = -5 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -8 - ELSE IF( LWORK.LT.MAX(2*N, 3*N-2) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. 
.NOT.LQUERY ) THEN INFO = -10 END IF * IF( INFO.EQ.0 ) THEN CALL DSYTRF_AA( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) - LWKOPT_SYTRF = INT( WORK(1) ) + LWKOPT_SYTRF = INT( WORK( 1 ) ) CALL DSYTRS_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, $ -1, INFO ) - LWKOPT_SYTRS = INT( WORK(1) ) - LWKOPT = MAX( LWKOPT_SYTRF, LWKOPT_SYTRS ) + LWKOPT_SYTRS = INT( WORK( 1 ) ) + LWKOPT = MAX( LWKMIN, LWKOPT_SYTRF, LWKOPT_SYTRS ) WORK( 1 ) = LWKOPT END IF * diff --git a/lapack-netlib/SRC/dsysv_aa_2stage.f b/lapack-netlib/SRC/dsysv_aa_2stage.f index 72fbe1e9a..90dd0a38a 100644 --- a/lapack-netlib/SRC/dsysv_aa_2stage.f +++ b/lapack-netlib/SRC/dsysv_aa_2stage.f @@ -101,14 +101,14 @@ *> *> \param[out] TB *> \verbatim -*> TB is DOUBLE PRECISION array, dimension (LTB) +*> TB is DOUBLE PRECISION array, dimension (MAX(1,LTB)) *> On exit, details of the LU factorization of the band matrix. *> \endverbatim *> *> \param[in] LTB *> \verbatim *> LTB is INTEGER -*> The size of the array TB. LTB >= 4*N, internally +*> The size of the array TB. LTB >= MAX(1,4*N), internally *> used to select NB such that LTB >= (3*NB+1)*N. *> *> If LTB = -1, then a workspace query is assumed; the @@ -148,14 +148,15 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION workspace of size LWORK +*> WORK is DOUBLE PRECISION workspace of size (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The size of WORK. LWORK >= N, internally used to select NB -*> such that LWORK >= N*NB. +*> The size of WORK. LWORK >= MAX(1,N), internally used to +*> select NB such that LWORK >= N*NB. *> *> If LWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the WORK array, @@ -179,7 +180,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYsolve +*> \ingroup hesv_aa_2stage * * ===================================================================== SUBROUTINE DSYSV_AA_2STAGE( UPLO, N, NRHS, A, LDA, TB, LTB, @@ -205,7 +206,7 @@ * * .. Local Scalars .. LOGICAL UPPER, TQUERY, WQUERY - INTEGER LWKOPT + INTEGER LWKMIN, LWKOPT * .. * .. External Functions .. LOGICAL LSAME @@ -226,6 +227,7 @@ UPPER = LSAME( UPLO, 'U' ) WQUERY = ( LWORK.EQ.-1 ) TQUERY = ( LTB.EQ.-1 ) + LWKMIN = MAX( 1, N ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -234,18 +236,19 @@ INFO = -3 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -5 - ELSE IF( LTB.LT.( 4*N ) .AND. .NOT.TQUERY ) THEN + ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN INFO = -7 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -11 - ELSE IF( LWORK.LT.N .AND. .NOT.WQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.WQUERY ) THEN INFO = -13 END IF * IF( INFO.EQ.0 ) THEN CALL DSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, -1, IPIV, $ IPIV2, WORK, -1, INFO ) - LWKOPT = INT( WORK(1) ) + LWKOPT = MAX( LWKMIN, INT( WORK( 1 ) ) ) + WORK( 1 ) = LWKOPT END IF * IF( INFO.NE.0 ) THEN @@ -255,7 +258,6 @@ RETURN END IF * -* * Compute the factorization A = U**T*T*U or A = L*T*L**T. * CALL DSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, IPIV2, diff --git a/lapack-netlib/SRC/dsysvx.f b/lapack-netlib/SRC/dsysvx.f index a30831e72..b2b8210ca 100644 --- a/lapack-netlib/SRC/dsysvx.f +++ b/lapack-netlib/SRC/dsysvx.f @@ -275,7 +275,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. 
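DSYSV_AA above now validates LWORK against LWKMIN = MAX(1, 2*N, 3*N-2) and folds that bound into the reported LWKOPT, and DSYSVX gets the analogous LWKMIN treatment just after this point. A free-form sketch of the intended use of DSYSV_AA with a queried workspace; the matrix and right-hand side are illustrative:

program sysv_aa_query
   ! Sketch: solve A*X = B with the Aasen factorization (DSYSV_AA),
   ! sizing WORK through the LWORK = -1 query.
   implicit none
   integer, parameter :: n = 6, nrhs = 1
   double precision :: a(n,n), b(n,nrhs), wq(1)
   integer :: ipiv(n), info, lwork, j
   double precision, allocatable :: work(:)
   ! Illustrative symmetric, diagonally dominant matrix and RHS.
   a = 1.0d0
   do j = 1, n
      a(j,j) = dble(n) + 1.0d0
      b(j,1) = 1.0d0
   end do
   ! Workspace query.
   call dsysv_aa('U', n, nrhs, a, n, ipiv, b, n, wq, -1, info)
   lwork = int(wq(1))
   allocate(work(lwork))
   ! Factor and solve; B is overwritten with the solution X.
   call dsysv_aa('U', n, nrhs, a, n, ipiv, b, n, work, lwork, info)
   print *, 'info =', info, ' x =', b(:,1)
end program sysv_aa_query

DSYSV_AA_2STAGE works the same way but additionally takes the band factor TB, whose size can be obtained with LTB = -1 (see the DSYTRF_AA_2STAGE sketch further below).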
* -*> \ingroup doubleSYsolve +*> \ingroup hesvx * * ===================================================================== SUBROUTINE DSYSVX( FACT, UPLO, N, NRHS, A, LDA, AF, LDAF, IPIV, B, @@ -305,7 +305,7 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, NOFACT - INTEGER LWKOPT, NB + INTEGER LWKMIN, LWKOPT, NB DOUBLE PRECISION ANORM * .. * .. External Functions .. @@ -327,6 +327,7 @@ INFO = 0 NOFACT = LSAME( FACT, 'N' ) LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 3*N ) IF( .NOT.NOFACT .AND. .NOT.LSAME( FACT, 'F' ) ) THEN INFO = -1 ELSE IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) @@ -344,12 +345,12 @@ INFO = -11 ELSE IF( LDX.LT.MAX( 1, N ) ) THEN INFO = -13 - ELSE IF( LWORK.LT.MAX( 1, 3*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -18 END IF * IF( INFO.EQ.0 ) THEN - LWKOPT = MAX( 1, 3*N ) + LWKOPT = LWKMIN IF( NOFACT ) THEN NB = ILAENV( 1, 'DSYTRF', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( LWKOPT, N*NB ) diff --git a/lapack-netlib/SRC/dsytrd.f b/lapack-netlib/SRC/dsytrd.f index 3dcfc3db2..58d4b633b 100644 --- a/lapack-netlib/SRC/dsytrd.f +++ b/lapack-netlib/SRC/dsytrd.f @@ -139,7 +139,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYcomputational +*> \ingroup hetrd * *> \par Further Details: * ===================== @@ -247,7 +247,7 @@ * Determine the block size. * NB = ILAENV( 1, 'DSYTRD', UPLO, N, -1, -1, -1 ) - LWKOPT = N*NB + LWKOPT = MAX( 1, N*NB ) WORK( 1 ) = LWKOPT END IF * diff --git a/lapack-netlib/SRC/dsytrd_2stage.f b/lapack-netlib/SRC/dsytrd_2stage.f index 8ae77d3e4..a88ac1c73 100644 --- a/lapack-netlib/SRC/dsytrd_2stage.f +++ b/lapack-netlib/SRC/dsytrd_2stage.f @@ -4,23 +4,23 @@ * * =========== DOCUMENTATION =========== * -* Online html documentation available at -* http://www.netlib.org/lapack/explore-html/ +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ * *> \htmlonly -*> Download DSYTRD_2STAGE + dependencies -*> -*> [TGZ] -*> -*> [ZIP] -*> +*> Download DSYTRD_2STAGE + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> *> [TXT] -*> \endhtmlonly +*> \endhtmlonly * * Definition: * =========== * -* SUBROUTINE DSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, +* SUBROUTINE DSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, * HOUS2, LHOUS2, WORK, LWORK, INFO ) * * IMPLICIT NONE @@ -34,7 +34,7 @@ * DOUBLE PRECISION A( LDA, * ), TAU( * ), * HOUS2( * ), WORK( * ) * .. -* +* * *> \par Purpose: * ============= @@ -52,11 +52,11 @@ *> \param[in] VECT *> \verbatim *> VECT is CHARACTER*1 -*> = 'N': No need for the Housholder representation, +*> = 'N': No need for the Housholder representation, *> in particular for the second stage (Band to *> tridiagonal) and thus LHOUS2 is of size max(1, 4*N); -*> = 'V': the Householder representation is needed to -*> either generate Q1 Q2 or to apply Q1 Q2, +*> = 'V': the Householder representation is needed to +*> either generate Q1 Q2 or to apply Q1 Q2, *> then LHOUS2 is to be queried and computed. *> (NOT AVAILABLE IN THIS RELEASE). *> \endverbatim @@ -86,7 +86,7 @@ *> triangular part of A is not referenced. 
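Stepping back from the DSYTRD_2STAGE header for a moment: the DSYTRD hunk just above only clamps the reported workspace to LWKOPT = MAX(1, N*NB), so the classic one-stage reduction is called exactly as before. A short free-form sketch, with an illustrative 8-by-8 matrix:

program sytrd_query
   ! Sketch: reduce a symmetric matrix to tridiagonal form with DSYTRD,
   ! using the LWORK = -1 query to size WORK.
   implicit none
   integer, parameter :: n = 8
   double precision :: a(n,n), d(n), e(n-1), tau(n-1), wq(1)
   integer :: info, lwork, i, j
   double precision, allocatable :: work(:)
   ! Illustrative symmetric matrix.
   do j = 1, n
      do i = 1, n
         a(i,j) = dble(abs(i-j))
      end do
   end do
   ! Workspace query, then the actual reduction.
   call dsytrd('L', n, a, n, d, e, tau, wq, -1, info)
   lwork = int(wq(1))
   allocate(work(lwork))
   call dsytrd('L', n, a, n, d, e, tau, work, lwork, info)
   print *, 'info =', info, ' diagonal of T:', d
end program sytrd_query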
*> On exit, if UPLO = 'U', the band superdiagonal *> of A are overwritten by the corresponding elements of the -*> internal band-diagonal matrix AB, and the elements above +*> internal band-diagonal matrix AB, and the elements above *> the KD superdiagonal, with the array TAU, represent the orthogonal *> matrix Q1 as a product of elementary reflectors; if UPLO *> = 'L', the diagonal and band subdiagonal of A are over- @@ -117,13 +117,13 @@ *> \param[out] TAU *> \verbatim *> TAU is DOUBLE PRECISION array, dimension (N-KD) -*> The scalar factors of the elementary reflectors of +*> The scalar factors of the elementary reflectors of *> the first stage (see Further Details). *> \endverbatim *> *> \param[out] HOUS2 *> \verbatim -*> HOUS2 is DOUBLE PRECISION array, dimension (LHOUS2) +*> HOUS2 is DOUBLE PRECISION array, dimension (MAX(1,LHOUS2)) *> Stores the Householder representation of the stage2 *> band to tridiagonal. *> \endverbatim @@ -132,6 +132,8 @@ *> \verbatim *> LHOUS2 is INTEGER *> The dimension of the array HOUS2. +*> LHOUS2 >= 1. +*> *> If LWORK = -1, or LHOUS2 = -1, *> then a query is assumed; the routine *> only calculates the optimal size of the HOUS2 array, returns @@ -143,23 +145,26 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension (LWORK) +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK = MAX(1, dimension) -*> If LWORK = -1, or LHOUS2=-1, +*> The dimension of the array WORK. +*> If N = 0, LWORK >= 1, else LWORK = MAX(1, dimension). +*> +*> If LWORK = -1, or LHOUS2 = -1, *> then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> LWORK = MAX(1, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N -*> = N*KD + N*max(KD+1,FACTOPTNB) -*> + max(2*KD*KD, KD*NTHREADS) -*> + (KD+1)*N +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) +*> + (KD+1)*N *> where KD is the blocking size of the reduction, *> FACTOPTNB is the blocking used by the QR or LQ *> algorithm, usually FACTOPTNB=128 is a good choice @@ -177,12 +182,12 @@ * Authors: * ======== * -*> \author Univ. of Tennessee -*> \author Univ. of California Berkeley -*> \author Univ. of Colorado Denver -*> \author NAG Ltd. +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. * -*> \ingroup doubleSYcomputational +*> \ingroup hetrd_2stage * *> \par Further Details: * ===================== @@ -202,7 +207,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -210,16 +215,16 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. 
-*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim *> * ===================================================================== - SUBROUTINE DSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, + SUBROUTINE DSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, $ HOUS2, LHOUS2, WORK, LWORK, INFO ) * IMPLICIT NONE @@ -265,10 +270,13 @@ * KD = ILAENV2STAGE( 1, 'DSYTRD_2STAGE', VECT, N, -1, -1, -1 ) IB = ILAENV2STAGE( 2, 'DSYTRD_2STAGE', VECT, N, KD, -1, -1 ) - LHMIN = ILAENV2STAGE( 3, 'DSYTRD_2STAGE', VECT, N, KD, IB, -1 ) - LWMIN = ILAENV2STAGE( 4, 'DSYTRD_2STAGE', VECT, N, KD, IB, -1 ) -* WRITE(*,*),'DSYTRD_2STAGE N KD UPLO LHMIN LWMIN ',N, KD, UPLO, -* $ LHMIN, LWMIN + IF( N.EQ.0 ) THEN + LHMIN = 1 + LWMIN = 1 + ELSE + LHMIN = ILAENV2STAGE( 3, 'DSYTRD_2STAGE', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV2STAGE( 4, 'DSYTRD_2STAGE', VECT, N, KD, IB, -1 ) + END IF * IF( .NOT.LSAME( VECT, 'N' ) ) THEN INFO = -1 @@ -309,14 +317,14 @@ LWRK = LWORK-LDAB*N ABPOS = 1 WPOS = ABPOS + LDAB*N - CALL DSYTRD_SY2SB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB, + CALL DSYTRD_SY2SB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB, $ TAU, WORK( WPOS ), LWRK, INFO ) IF( INFO.NE.0 ) THEN CALL XERBLA( 'DSYTRD_SY2SB', -INFO ) RETURN END IF - CALL DSYTRD_SB2ST( 'Y', VECT, UPLO, N, KD, - $ WORK( ABPOS ), LDAB, D, E, + CALL DSYTRD_SB2ST( 'Y', VECT, UPLO, N, KD, + $ WORK( ABPOS ), LDAB, D, E, $ HOUS2, LHOUS2, WORK( WPOS ), LWRK, INFO ) IF( INFO.NE.0 ) THEN CALL XERBLA( 'DSYTRD_SB2ST', -INFO ) @@ -324,8 +332,7 @@ END IF * * - HOUS2( 1 ) = LHMIN - WORK( 1 ) = LWMIN + WORK( 1 ) = LWMIN RETURN * * End of DSYTRD_2STAGE diff --git a/lapack-netlib/SRC/dsytrd_sb2st.F b/lapack-netlib/SRC/dsytrd_sb2st.F index bb74dd491..04d03d587 100644 --- a/lapack-netlib/SRC/dsytrd_sb2st.F +++ b/lapack-netlib/SRC/dsytrd_sb2st.F @@ -18,7 +18,7 @@ * Definition: * =========== * -* SUBROUTINE DSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, +* SUBROUTINE DSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, * D, E, HOUS, LHOUS, WORK, LWORK, INFO ) * * #if defined(_OPENMP) @@ -53,12 +53,12 @@ *> \param[in] STAGE1 *> \verbatim *> STAGE1 is CHARACTER*1 -*> = 'N': "No": to mention that the stage 1 of the reduction +*> = 'N': "No": to mention that the stage 1 of the reduction *> from dense to band using the dsytrd_sy2sb routine -*> was not called before this routine to reproduce AB. -*> In other term this routine is called as standalone. -*> = 'Y': "Yes": to mention that the stage 1 of the -*> reduction from dense to band using the dsytrd_sy2sb +*> was not called before this routine to reproduce AB. +*> In other term this routine is called as standalone. +*> = 'Y': "Yes": to mention that the stage 1 of the +*> reduction from dense to band using the dsytrd_sy2sb *> routine has been called to produce AB (e.g., AB is *> the output of dsytrd_sy2sb. 
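Returning to DSYTRD_2STAGE for a moment: its hunks above make both LHMIN and LWMIN equal to 1 when N = 0 and keep the LHOUS2 = -1 / LWORK = -1 queries, so a caller can size both arrays in one preliminary call. A free-form sketch under those assumptions (VECT = 'N' is the only supported mode, and TAU is over-allocated to N because its exact length N-KD depends on the internally chosen band width KD):

program sytrd_2stage_query
   ! Sketch: two-stage tridiagonal reduction with DSYTRD_2STAGE
   ! (VECT = 'N'), querying both LHOUS2 and LWORK with -1.
   implicit none
   integer, parameter :: n = 20
   double precision :: a(n,n), d(n), e(n-1), tau(n), hq(1), wq(1)
   integer :: info, lhous2, lwork, i, j
   double precision, allocatable :: hous2(:), work(:)
   ! Illustrative symmetric matrix.
   do j = 1, n
      do i = 1, n
         a(i,j) = 1.0d0 / dble(i+j-1)
      end do
   end do
   ! Query both sizes in one call.
   call dsytrd_2stage('N', 'L', n, a, n, d, e, tau, hq, -1, wq, -1, info)
   lhous2 = int(hq(1))
   lwork = int(wq(1))
   allocate(hous2(lhous2), work(lwork))
   ! Actual reduction; D and E receive the tridiagonal matrix.
   call dsytrd_2stage('N', 'L', n, a, n, d, e, tau, hous2, lhous2, &
                      work, lwork, info)
   print *, 'info =', info
end program sytrd_2stage_query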
*> \endverbatim @@ -66,10 +66,10 @@ *> \param[in] VECT *> \verbatim *> VECT is CHARACTER*1 -*> = 'N': No need for the Housholder representation, +*> = 'N': No need for the Housholder representation, *> and thus LHOUS is of size max(1, 4*N); -*> = 'V': the Householder representation is needed to -*> either generate or to apply Q later on, +*> = 'V': the Householder representation is needed to +*> either generate or to apply Q later on, *> then LHOUS is to be queried and computed. *> (NOT AVAILABLE IN THIS RELEASE). *> \endverbatim @@ -132,34 +132,39 @@ *> *> \param[out] HOUS *> \verbatim -*> HOUS is DOUBLE PRECISION array, dimension LHOUS, that -*> store the Householder representation. +*> HOUS is DOUBLE PRECISION array, dimension (MAX(1,LHOUS)) +*> Stores the Householder representation. *> \endverbatim *> *> \param[in] LHOUS *> \verbatim *> LHOUS is INTEGER -*> The dimension of the array HOUS. LHOUS = MAX(1, dimension) -*> If LWORK = -1, or LHOUS=-1, +*> The dimension of the array HOUS. +*> If N = 0 or KD <= 1, LHOUS >= 1, else LHOUS = MAX(1, dimension). +*> +*> If LWORK = -1, or LHOUS = -1, *> then a query is assumed; the routine *> only calculates the optimal size of the HOUS array, returns *> this value as the first entry of the HOUS array, and no error *> message related to LHOUS is issued by XERBLA. *> LHOUS = MAX(1, dimension) where *> dimension = 4*N if VECT='N' -*> not available now if VECT='H' +*> not available now if VECT='H' *> \endverbatim *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension LWORK. +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK = MAX(1, dimension) -*> If LWORK = -1, or LHOUS=-1, +*> The dimension of the array WORK. +*> If N = 0 or KD <= 1, LWORK >= 1, else LWORK = MAX(1, dimension). +*> +*> If LWORK = -1, or LHOUS = -1, *> then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error @@ -188,7 +193,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup real16OTHERcomputational +*> \ingroup hetrd_hb2st * *> \par Further Details: * ===================== @@ -208,7 +213,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -216,16 +221,16 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. 
-*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim *> * ===================================================================== - SUBROUTINE DSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, + SUBROUTINE DSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, $ D, E, HOUS, LHOUS, WORK, LWORK, INFO ) * #if defined(_OPENMP) @@ -258,11 +263,11 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, WANTQ, UPPER, AFTERS1 - INTEGER I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST, + INTEGER I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST, $ ED, STIND, EDIND, BLKLASTIND, COLPT, THED, $ STEPERCOL, GRSIZ, THGRSIZ, THGRNB, THGRID, - $ NBTILES, TTYPE, TID, NTHREADS, DEBUG, - $ ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS, + $ NBTILES, TTYPE, TID, NTHREADS, + $ ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS, $ INDA, INDW, APOS, SIZEA, LDA, INDV, INDTAU, $ SIDEV, SIZETAU, LDV, LHMIN, LWMIN * .. @@ -274,7 +279,7 @@ * .. * .. External Functions .. LOGICAL LSAME - INTEGER ILAENV2STAGE + INTEGER ILAENV2STAGE EXTERNAL LSAME, ILAENV2STAGE * .. * .. Executable Statements .. @@ -282,7 +287,6 @@ * Determine the minimal workspace size required. * Test the input parameters * - DEBUG = 0 INFO = 0 AFTERS1 = LSAME( STAGE1, 'Y' ) WANTQ = LSAME( VECT, 'V' ) @@ -291,9 +295,14 @@ * * Determine the block size, the workspace size and the hous size. * - IB = ILAENV2STAGE( 2, 'DSYTRD_SB2ST', VECT, N, KD, -1, -1 ) - LHMIN = ILAENV2STAGE( 3, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 ) - LWMIN = ILAENV2STAGE( 4, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 ) + IB = ILAENV2STAGE( 2, 'DSYTRD_SB2ST', VECT, N, KD, -1, -1 ) + IF( N.EQ.0 .OR. KD.LE.1 ) THEN + LHMIN = 1 + LWMIN = 1 + ELSE + LHMIN = ILAENV2STAGE( 3, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV2STAGE( 4, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 ) + END IF * IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN INFO = -1 @@ -355,7 +364,7 @@ ABDPOS = KD + 1 ABOFDPOS = KD ELSE - APOS = INDA + APOS = INDA AWPOS = INDA + KD + 1 DPOS = APOS OFDPOS = DPOS + 1 @@ -363,11 +372,11 @@ ABOFDPOS = 2 ENDIF -* -* Case KD=0: -* The matrix is diagonal. We just copy it (convert to "real" for -* real because D is double and the imaginary part should be 0) -* and store it in D. A sequential code here is better or +* +* Case KD=0: +* The matrix is diagonal. We just copy it (convert to "real" for +* real because D is double and the imaginary part should be 0) +* and store it in D. A sequential code here is better or * in a parallel environment it might need two cores for D and E * IF( KD.EQ.0 ) THEN @@ -382,17 +391,17 @@ WORK( 1 ) = 1 RETURN END IF -* -* Case KD=1: -* The matrix is already Tridiagonal. We have to make diagonal +* +* Case KD=1: +* The matrix is already Tridiagonal. We have to make diagonal * and offdiagonal elements real, and store them in D and E. -* For that, for real precision just copy the diag and offdiag -* to D and E while for the COMPLEX case the bulge chasing is -* performed to convert the hermetian tridiagonal to symmetric -* tridiagonal. A simpler conversion formula might be used, but then +* For that, for real precision just copy the diag and offdiag +* to D and E while for the COMPLEX case the bulge chasing is +* performed to convert the hermetian tridiagonal to symmetric +* tridiagonal. A simpler conversion formula might be used, but then * updating the Q matrix will be required and based if Q is generated -* or not this might complicate the story. -* +* or not this might complicate the story. 
+* IF( KD.EQ.1 ) THEN DO 50 I = 1, N D( I ) = ( AB( ABDPOS, I ) ) @@ -413,7 +422,7 @@ RETURN END IF * -* Main code start here. +* Main code start here. * Reduce the symmetric band of A to a tridiagonal matrix. * THGRSIZ = N @@ -422,7 +431,7 @@ NBTILES = CEILING( REAL(N)/REAL(KD) ) STEPERCOL = CEILING( REAL(SHIFT)/REAL(GRSIZ) ) THGRNB = CEILING( REAL(N-1)/REAL(THGRSIZ) ) -* +* CALL DLACPY( "A", KD+1, N, AB, LDAB, WORK( APOS ), LDA ) CALL DLASET( "A", KD, N, ZERO, ZERO, WORK( AWPOS ), LDA ) * @@ -431,7 +440,7 @@ * #if defined(_OPENMP) !$OMP PARALLEL PRIVATE( TID, THGRID, BLKLASTIND ) -!$OMP$ PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID ) +!$OMP$ PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID ) !$OMP$ PRIVATE( MYID, TTYPE, COLPT, STIND, EDIND ) !$OMP$ SHARED ( UPLO, WANTQ, INDV, INDTAU, HOUS, WORK) !$OMP$ SHARED ( N, KD, IB, NBTILES, LDA, LDV, INDA ) @@ -440,7 +449,7 @@ #endif * * main bulge chasing loop -* +* DO 100 THGRID = 1, THGRNB STT = (THGRID-1)*THGRSIZ+1 THED = MIN( (STT + THGRSIZ -1), (N-1)) @@ -451,7 +460,7 @@ ST = STT DO 130 SWEEPID = ST, ED DO 140 K = 1, GRSIZ - MYID = (I-SWEEPID)*(STEPERCOL*GRSIZ) + MYID = (I-SWEEPID)*(STEPERCOL*GRSIZ) $ + (M-1)*GRSIZ + K IF ( MYID.EQ.1 ) THEN TTYPE = 1 @@ -477,16 +486,16 @@ ENDIF * * Call the kernel -* +* #if defined(_OPENMP) && _OPENMP >= 201307 - IF( TTYPE.NE.1 ) THEN + IF( TTYPE.NE.1 ) THEN !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) !$OMP$ DEPEND(in:WORK(MYID-1)) !$OMP$ DEPEND(out:WORK(MYID)) TID = OMP_GET_THREAD_NUM() - CALL DSB2ST_KERNELS( UPLO, WANTQ, TTYPE, + CALL DSB2ST_KERNELS( UPLO, WANTQ, TTYPE, $ STIND, EDIND, SWEEPID, N, KD, IB, - $ WORK ( INDA ), LDA, + $ WORK ( INDA ), LDA, $ HOUS( INDV ), HOUS( INDTAU ), LDV, $ WORK( INDW + TID*KD ) ) !$OMP END TASK @@ -494,20 +503,20 @@ !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) !$OMP$ DEPEND(out:WORK(MYID)) TID = OMP_GET_THREAD_NUM() - CALL DSB2ST_KERNELS( UPLO, WANTQ, TTYPE, + CALL DSB2ST_KERNELS( UPLO, WANTQ, TTYPE, $ STIND, EDIND, SWEEPID, N, KD, IB, - $ WORK ( INDA ), LDA, + $ WORK ( INDA ), LDA, $ HOUS( INDV ), HOUS( INDTAU ), LDV, $ WORK( INDW + TID*KD ) ) !$OMP END TASK ENDIF #else - CALL DSB2ST_KERNELS( UPLO, WANTQ, TTYPE, + CALL DSB2ST_KERNELS( UPLO, WANTQ, TTYPE, $ STIND, EDIND, SWEEPID, N, KD, IB, - $ WORK ( INDA ), LDA, + $ WORK ( INDA ), LDA, $ HOUS( INDV ), HOUS( INDTAU ), LDV, $ WORK( INDW ) ) -#endif +#endif IF ( BLKLASTIND.GE.(N-1) ) THEN STT = STT + 1 EXIT @@ -522,14 +531,14 @@ !$OMP END MASTER !$OMP END PARALLEL #endif -* +* * Copy the diagonal from A to D. Note that D is REAL thus only * the Real part is needed, the imaginary part should be zero. * DO 150 I = 1, N D( I ) = ( WORK( DPOS+(I-1)*LDA ) ) 150 CONTINUE -* +* * Copy the off diagonal from A to E. Note that E is REAL thus only * the Real part is needed, the imaginary part should be zero. * @@ -543,11 +552,10 @@ 170 CONTINUE ENDIF * - HOUS( 1 ) = LHMIN WORK( 1 ) = LWMIN RETURN * * End of DSYTRD_SB2ST * END - + diff --git a/lapack-netlib/SRC/dsytrd_sy2sb.f b/lapack-netlib/SRC/dsytrd_sy2sb.f index 1660b5c7e..38acc71f1 100644 --- a/lapack-netlib/SRC/dsytrd_sy2sb.f +++ b/lapack-netlib/SRC/dsytrd_sy2sb.f @@ -123,8 +123,8 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension (LWORK) -*> On exit, if INFO = 0, or if LWORK=-1, +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, or if LWORK = -1, *> WORK(1) returns the size of LWORK. 
*> \endverbatim *> @@ -132,7 +132,9 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK which should be calculated -*> by a workspace query. LWORK = MAX(1, LWORK_QUERY) +*> by a workspace query. +*> If N <= KD+1, LWORK >= 1, else LWORK = MAX(1, LWORK_QUERY) +*> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error @@ -158,7 +160,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYcomputational +*> \ingroup hetrd_he2hb * *> \par Further Details: * ===================== @@ -293,8 +295,12 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) - LWMIN = ILAENV2STAGE( 4, 'DSYTRD_SY2SB', '', N, KD, -1, -1 ) - + IF( N.LE.KD+1 ) THEN + LWMIN = 1 + ELSE + LWMIN = ILAENV2STAGE( 4, 'DSYTRD_SY2SB', ' ', N, KD, -1, -1 ) + END IF +* IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN diff --git a/lapack-netlib/SRC/dsytrf.f b/lapack-netlib/SRC/dsytrf.f index aee9b3f6a..2a1a2d4dc 100644 --- a/lapack-netlib/SRC/dsytrf.f +++ b/lapack-netlib/SRC/dsytrf.f @@ -107,7 +107,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned by ILAENV. *> *> If LWORK = -1, then a workspace query is assumed; the routine @@ -135,7 +135,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYcomputational +*> \ingroup hetrf * *> \par Further Details: * ===================== @@ -352,6 +352,7 @@ END IF * 40 CONTINUE +* WORK( 1 ) = LWKOPT RETURN * diff --git a/lapack-netlib/SRC/dsytrf_aa.f b/lapack-netlib/SRC/dsytrf_aa.f index 9a0b26ce5..924d4c165 100644 --- a/lapack-netlib/SRC/dsytrf_aa.f +++ b/lapack-netlib/SRC/dsytrf_aa.f @@ -101,8 +101,10 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >= MAX(1,2*N). For optimum performance -*> LWORK >= N*(1+NB), where NB is the optimal blocksize. +*> The length of WORK. +*> LWORK >= 1, if N <= 1, and LWORK >= 2*N, otherwise. +*> For optimum performance LWORK >= N*(1+NB), where NB is +*> the optimal blocksize, returned by ILAENV. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -125,10 +127,10 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYcomputational +*> \ingroup hetrf_aa * * ===================================================================== - SUBROUTINE DSYTRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO) + SUBROUTINE DSYTRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -152,7 +154,7 @@ * * .. Local Scalars .. LOGICAL LQUERY, UPPER - INTEGER J, LWKOPT + INTEGER J, LWKMIN, LWKOPT INTEGER NB, MJ, NJ, K1, K2, J1, J2, J3, JB DOUBLE PRECISION ALPHA * .. @@ -179,18 +181,25 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) + IF( N.LE.1 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = 2*N + LWKOPT = (NB+1)*N + END IF +* IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. 
.NOT.LQUERY ) THEN INFO = -7 END IF * IF( INFO.EQ.0 ) THEN - LWKOPT = (NB+1)*N WORK( 1 ) = LWKOPT END IF * @@ -203,11 +212,11 @@ * * Quick return * - IF ( N.EQ.0 ) THEN + IF( N.EQ.0 ) THEN RETURN ENDIF IPIV( 1 ) = 1 - IF ( N.EQ.1 ) THEN + IF( N.EQ.1 ) THEN RETURN END IF * diff --git a/lapack-netlib/SRC/dsytrf_aa_2stage.f b/lapack-netlib/SRC/dsytrf_aa_2stage.f index c65bd86e6..fae95bab2 100644 --- a/lapack-netlib/SRC/dsytrf_aa_2stage.f +++ b/lapack-netlib/SRC/dsytrf_aa_2stage.f @@ -87,14 +87,14 @@ *> *> \param[out] TB *> \verbatim -*> TB is DOUBLE PRECISION array, dimension (LTB) +*> TB is DOUBLE PRECISION array, dimension (MAX(1,LTB)) *> On exit, details of the LU factorization of the band matrix. *> \endverbatim *> *> \param[in] LTB *> \verbatim *> LTB is INTEGER -*> The size of the array TB. LTB >= 4*N, internally +*> The size of the array TB. LTB >= MAX(1,4*N), internally *> used to select NB such that LTB >= (3*NB+1)*N. *> *> If LTB = -1, then a workspace query is assumed; the @@ -121,14 +121,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION workspace of size LWORK +*> WORK is DOUBLE PRECISION workspace of size (MAX(1,LWORK)) *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The size of WORK. LWORK >= N, internally used to select NB -*> such that LWORK >= N*NB. +*> The size of WORK. LWORK >= MAX(1,N), internally used +*> to select NB such that LWORK >= N*NB. *> *> If LWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the WORK array, @@ -152,7 +152,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYcomputational +*> \ingroup hetrf_aa_2stage * * ===================================================================== SUBROUTINE DSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, @@ -211,9 +211,9 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF ( LTB .LT. 4*N .AND. .NOT.TQUERY ) THEN + ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN INFO = -6 - ELSE IF ( LWORK .LT. N .AND. .NOT.WQUERY ) THEN + ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.WQUERY ) THEN INFO = -10 END IF * @@ -227,10 +227,10 @@ NB = ILAENV( 1, 'DSYTRF_AA_2STAGE', UPLO, N, -1, -1, -1 ) IF( INFO.EQ.0 ) THEN IF( TQUERY ) THEN - TB( 1 ) = (3*NB+1)*N + TB( 1 ) = MAX( 1, (3*NB+1)*N ) END IF IF( WQUERY ) THEN - WORK( 1 ) = N*NB + WORK( 1 ) = MAX( 1, N*NB ) END IF END IF IF( TQUERY .OR. WQUERY ) THEN @@ -239,7 +239,7 @@ * * Quick return * - IF ( N.EQ.0 ) THEN + IF( N.EQ.0 ) THEN RETURN ENDIF * diff --git a/lapack-netlib/SRC/dsytrf_rk.f b/lapack-netlib/SRC/dsytrf_rk.f index 086586968..0717eb076 100644 --- a/lapack-netlib/SRC/dsytrf_rk.f +++ b/lapack-netlib/SRC/dsytrf_rk.f @@ -177,14 +177,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension ( MAX(1,LWORK) ). +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)). *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned *> by ILAENV. *> @@ -229,7 +229,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. 
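DSYTRF_AA_2STAGE above (like its driver DSYSV_AA_2STAGE earlier) now reports MAX(1, (3*NB+1)*N) and MAX(1, N*NB) from the LTB = -1 and LWORK = -1 queries and accepts LTB >= MAX(1, 4*N). A free-form sketch of querying both sizes before factoring an illustrative matrix; solving would then proceed with DSYTRS_AA_2STAGE:

program sytrf_aa_2stage_query
   ! Sketch: query the TB and WORK sizes of DSYTRF_AA_2STAGE with
   ! LTB = -1 and LWORK = -1, then factor a symmetric matrix.
   implicit none
   integer, parameter :: n = 6
   double precision :: a(n,n), tbq(1), wq(1)
   integer :: ipiv(n), ipiv2(n), info, ltb, lwork, j
   double precision, allocatable :: tb(:), work(:)
   ! Illustrative symmetric, diagonally dominant matrix.
   a = 1.0d0
   do j = 1, n
      a(j,j) = dble(n) + 1.0d0
   end do
   ! Size queries: the TB and WORK sizes come back in TBQ(1) and WQ(1).
   call dsytrf_aa_2stage('U', n, a, n, tbq, -1, ipiv, ipiv2, wq, -1, info)
   ltb = int(tbq(1))
   lwork = int(wq(1))
   allocate(tb(ltb), work(lwork))
   ! Actual factorization A = U**T*T*U.
   call dsytrf_aa_2stage('U', n, a, n, tb, ltb, ipiv, ipiv2, work, lwork, info)
   print *, 'info =', info, ' ltb =', ltb, ' lwork =', lwork
end program sytrf_aa_2stage_query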
* -*> \ingroup doubleSYcomputational +*> \ingroup hetrf_rk * *> \par Further Details: * ===================== diff --git a/lapack-netlib/SRC/dsytrf_rook.f b/lapack-netlib/SRC/dsytrf_rook.f index 2f00d1802..316663485 100644 --- a/lapack-netlib/SRC/dsytrf_rook.f +++ b/lapack-netlib/SRC/dsytrf_rook.f @@ -118,7 +118,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned by ILAENV. *> *> If LWORK = -1, then a workspace query is assumed; the routine @@ -146,7 +146,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYcomputational +*> \ingroup hetrf_rook * *> \par Further Details: * ===================== diff --git a/lapack-netlib/SRC/dsytri2.f b/lapack-netlib/SRC/dsytri2.f index dbcdcdb58..5960d3992 100644 --- a/lapack-netlib/SRC/dsytri2.f +++ b/lapack-netlib/SRC/dsytri2.f @@ -88,16 +88,16 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension (N+NB+1)*(NB+3) +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*> WORK is size >= (N+NB+1)*(NB+3) +*> If N = 0, LWORK >= 1, else LWORK >= (N+NB+1)*(NB+3). *> If LWORK = -1, then a workspace query is assumed; the routine -*> calculates: +*> calculates: *> - the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, *> - and no error message related to LWORK is issued by XERBLA. @@ -120,7 +120,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYcomputational +*> \ingroup hetri2 * * ===================================================================== SUBROUTINE DSYTRI2( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO ) @@ -159,9 +159,13 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) +* * Get blocksize +* NBMAX = ILAENV( 1, 'DSYTRI2', UPLO, N, -1, -1, -1 ) - IF ( NBMAX .GE. N ) THEN + IF( N.EQ.0 ) THEN + MINSIZE = 1 + ELSE IF( NBMAX.GE.N ) THEN MINSIZE = N ELSE MINSIZE = (N+NBMAX+1)*(NBMAX+3) @@ -173,28 +177,29 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF (LWORK .LT. MINSIZE .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.MINSIZE .AND. .NOT.LQUERY ) THEN INFO = -7 END IF -* -* Quick return if possible -* * IF( INFO.NE.0 ) THEN CALL XERBLA( 'DSYTRI2', -INFO ) RETURN ELSE IF( LQUERY ) THEN - WORK(1)=MINSIZE + WORK( 1 ) = MINSIZE RETURN END IF +* +* Quick return if possible +* IF( N.EQ.0 ) $ RETURN - IF( NBMAX .GE. N ) THEN + IF( NBMAX.GE.N ) THEN CALL DSYTRI( UPLO, N, A, LDA, IPIV, WORK, INFO ) ELSE CALL DSYTRI2X( UPLO, N, A, LDA, IPIV, WORK, NBMAX, INFO ) END IF +* RETURN * * End of DSYTRI2 diff --git a/lapack-netlib/SRC/dsytri_3.f b/lapack-netlib/SRC/dsytri_3.f index 86d69cdfd..50834c605 100644 --- a/lapack-netlib/SRC/dsytri_3.f +++ b/lapack-netlib/SRC/dsytri_3.f @@ -119,16 +119,17 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension (N+NB+1)*(NB+3). +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)). *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >= (N+NB+1)*(NB+3). +*> The length of WORK. +*> If N = 0, LWORK >= 1, else LWORK >= (N+NB+1)*(NB+3). 
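[Reviewer note, illustration only, not part of the patch] The DSYTRI2 and DSYTRI_3 hunks around this point return a minimum workspace of 1 for N = 0 and keep the LWORK = -1 query. A sketch of chaining the factorization and the inversion, re-querying before the second step; the matrix size, the diagonal shift, and the random data are arbitrary choices:

      program demo_dsytri2
         implicit none
         integer, parameter :: n = 50              ! arbitrary size for the sketch
         double precision, allocatable :: a(:,:), work(:)
         double precision :: wq(1)
         integer :: ipiv(n), info, lwork, j

         allocate( a(n,n) )
         call random_number( a )
         a = 0.5d0 * ( a + transpose( a ) )
         do j = 1, n
            a(j,j) = a(j,j) + n                    ! diagonally dominant => nonsingular
         end do

         ! Factor A = U*D*U**T, sizing WORK by a query.
         call dsytrf( 'U', n, a, n, ipiv, wq, -1, info )
         lwork = int( wq(1) )
         allocate( work(lwork) )
         call dsytrf( 'U', n, a, n, ipiv, work, lwork, info )

         ! DSYTRI2 may need a larger WORK than DSYTRF, so query again.
         call dsytri2( 'U', n, a, n, ipiv, wq, -1, info )
         if ( int( wq(1) ) > lwork ) then
            deallocate( work )
            lwork = int( wq(1) )
            allocate( work(lwork) )
         end if
         call dsytri2( 'U', n, a, n, ipiv, work, lwork, info )
         print *, 'dsytri2: info =', info
      end program demo_dsytri2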
*> -*> If LDWORK = -1, then a workspace query is assumed; +*> If LWORK = -1, then a workspace query is assumed; *> the routine only calculates the optimal size of the optimal *> size of the WORK array, returns this value as the first *> entry of the WORK array, and no error message related to @@ -152,7 +153,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYcomputational +*> \ingroup hetri_3 * *> \par Contributors: * ================== @@ -208,8 +209,13 @@ * * Determine the block size * - NB = MAX( 1, ILAENV( 1, 'DSYTRI_3', UPLO, N, -1, -1, -1 ) ) - LWKOPT = ( N+NB+1 ) * ( NB+3 ) + IF( N.EQ.0 ) THEN + LWKOPT = 1 + ELSE + NB = MAX( 1, ILAENV( 1, 'DSYTRI_3', UPLO, N, -1, -1, -1 ) ) + LWKOPT = ( N+NB+1 ) * ( NB+3 ) + END IF + WORK( 1 ) = LWKOPT * IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 @@ -217,7 +223,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF ( LWORK .LT. LWKOPT .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKOPT .AND. .NOT.LQUERY ) THEN INFO = -8 END IF * @@ -225,7 +231,6 @@ CALL XERBLA( 'DSYTRI_3', -INFO ) RETURN ELSE IF( LQUERY ) THEN - WORK( 1 ) = LWKOPT RETURN END IF * diff --git a/lapack-netlib/SRC/dsytrs_aa.f b/lapack-netlib/SRC/dsytrs_aa.f index 26b11a2a0..f0016cb7f 100644 --- a/lapack-netlib/SRC/dsytrs_aa.f +++ b/lapack-netlib/SRC/dsytrs_aa.f @@ -105,7 +105,13 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,3*N-2). +*> The dimension of the array WORK. +*> If MIN(N,NRHS) = 0, LWORK >= 1, else LWORK >= 3*N-2. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the minimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. *> \endverbatim *> *> \param[out] INFO @@ -123,7 +129,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYcomputational +*> \ingroup hetrs_aa * * ===================================================================== SUBROUTINE DSYTRS_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, @@ -151,7 +157,7 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, UPPER - INTEGER K, KP, LWKOPT + INTEGER K, KP, LWKMIN * .. * .. External Functions .. LOGICAL LSAME @@ -161,13 +167,19 @@ EXTERNAL DLACPY, DGTSV, DSWAP, DTRSM, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC MAX + INTRINSIC MIN, MAX * .. * .. Executable Statements .. * INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) + IF( MIN( N, NRHS ).EQ.0 ) THEN + LWKMIN = 1 + ELSE + LWKMIN = 3*N-2 + END IF +* IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -178,21 +190,20 @@ INFO = -5 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -8 - ELSE IF( LWORK.LT.MAX( 1, 3*N-2 ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'DSYTRS_AA', -INFO ) RETURN ELSE IF( LQUERY ) THEN - LWKOPT = (3*N-2) - WORK( 1 ) = LWKOPT + WORK( 1 ) = LWKMIN RETURN END IF * * Quick return if possible * - IF( N.EQ.0 .OR. NRHS.EQ.0 ) + IF( MIN( N, NRHS ).EQ.0 ) $ RETURN * IF( UPPER ) THEN diff --git a/lapack-netlib/SRC/sbbcsd.f b/lapack-netlib/SRC/sbbcsd.f index d5720cb33..2a619cb71 100644 --- a/lapack-netlib/SRC/sbbcsd.f +++ b/lapack-netlib/SRC/sbbcsd.f @@ -805,7 +805,7 @@ CALL SLARTGP( B22BULGE, B22E(I-1), WORK(IU2SN+I-1), $ WORK(IU2CS+I-1), R ) ELSE IF( NU .LT. 
MU ) THEN - CALL SLARTGS( B21E(I), B21E(I+1), NU, WORK(IU2CS+I-1), + CALL SLARTGS( B21E(I), B21D(I+1), NU, WORK(IU2CS+I-1), $ WORK(IU2SN+I-1) ) ELSE CALL SLARTGS( B22D(I), B22E(I), MU, WORK(IU2CS+I-1), diff --git a/lapack-netlib/SRC/sgebrd.f b/lapack-netlib/SRC/sgebrd.f index 2d0c6d651..b33ad0b1f 100644 --- a/lapack-netlib/SRC/sgebrd.f +++ b/lapack-netlib/SRC/sgebrd.f @@ -122,7 +122,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of the array WORK. LWORK >= max(1,M,N). +*> The length of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MAX(M,N), otherwise. *> For optimum performance LWORK >= (M+N)*NB, where NB *> is the optimal blocksize. *> @@ -223,8 +224,8 @@ * .. * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IINFO, J, LDWRKX, LDWRKY, LWKOPT, MINMN, NB, - $ NBMIN, NX, WS + INTEGER I, IINFO, J, LDWRKX, LDWRKY, LWKMIN, LWKOPT, + $ MINMN, NB, NBMIN, NX, WS * .. * .. External Subroutines .. EXTERNAL SGEBD2, SGEMM, SLABRD, XERBLA @@ -242,9 +243,16 @@ * Test the input parameters * INFO = 0 - NB = MAX( 1, ILAENV( 1, 'SGEBRD', ' ', M, N, -1, -1 ) ) - LWKOPT = ( M+N )*NB - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = MAX( M, N ) + NB = MAX( 1, ILAENV( 1, 'SGEBRD', ' ', M, N, -1, -1 ) ) + LWKOPT = ( M+N )*NB + ENDIF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -252,7 +260,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, M, N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF IF( INFO.LT.0 ) THEN @@ -264,7 +272,6 @@ * * Quick return if possible * - MINMN = MIN( M, N ) IF( MINMN.EQ.0 ) THEN WORK( 1 ) = 1 RETURN @@ -283,7 +290,7 @@ * Determine when to switch from blocked to unblocked code. * IF( NX.LT.MINMN ) THEN - WS = ( M+N )*NB + WS = LWKOPT IF( LWORK.LT.WS ) THEN * * Not enough work space for the optimal NB, consider using @@ -342,7 +349,8 @@ * CALL SGEBD2( M-I+1, N-I+1, A( I, I ), LDA, D( I ), E( I ), $ TAUQ( I ), TAUP( I ), WORK, IINFO ) - WORK( 1 ) = SROUNDUP_LWORK(WS) +* + WORK( 1 ) = SROUNDUP_LWORK( WS ) RETURN * * End of SGEBRD diff --git a/lapack-netlib/SRC/sgedmd.f90 b/lapack-netlib/SRC/sgedmd.f90 index 49cb11527..4860e8898 100644 --- a/lapack-netlib/SRC/sgedmd.f90 +++ b/lapack-netlib/SRC/sgedmd.f90 @@ -1,423 +1,573 @@ - SUBROUTINE SGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & - M, N, X, LDX, Y, LDY, NRNK, TOL, & - K, REIG, IMEIG, Z, LDZ, RES, & - B, LDB, W, LDW, S, LDS, & - WORK, LWORK, IWORK, LIWORK, INFO ) -! March 2023 +!> \brief \b SGEDMD computes the Dynamic Mode Decomposition (DMD) for a pair of data snapshot matrices. +! +! =========== DOCUMENTATION =========== +! +! Definition: +! =========== +! +! SUBROUTINE SGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & +! M, N, X, LDX, Y, LDY, NRNK, TOL, & +! K, REIG, IMEIG, Z, LDZ, RES, & +! B, LDB, W, LDW, S, LDS, & +! WORK, LWORK, IWORK, LIWORK, INFO ) !..... - USE iso_fortran_env - IMPLICIT NONE - INTEGER, PARAMETER :: WP = real32 +! USE iso_fortran_env +! IMPLICIT NONE +! INTEGER, PARAMETER :: WP = real32 !..... ! Scalar arguments - CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF - INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & - NRNK, LDZ, LDB, LDW, LDS, & - LWORK, LIWORK - INTEGER, INTENT(OUT) :: K, INFO - REAL(KIND=WP), INTENT(IN) :: TOL +! CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF +! INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & +! NRNK, LDZ, LDB, LDW, LDS, & +! LWORK, LIWORK +! 
INTEGER, INTENT(OUT) :: K, INFO +! REAL(KIND=WP), INTENT(IN) :: TOL ! Array arguments - REAL(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) - REAL(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & - W(LDW,*), S(LDS,*) - REAL(KIND=WP), INTENT(OUT) :: REIG(*), IMEIG(*), & - RES(*) - REAL(KIND=WP), INTENT(OUT) :: WORK(*) - INTEGER, INTENT(OUT) :: IWORK(*) -!............................................................ -! Purpose -! ======= -! SGEDMD computes the Dynamic Mode Decomposition (DMD) for -! a pair of data snapshot matrices. For the input matrices -! X and Y such that Y = A*X with an unaccessible matrix -! A, SGEDMD computes a certain number of Ritz pairs of A using -! the standard Rayleigh-Ritz extraction from a subspace of -! range(X) that is determined using the leading left singular -! vectors of X. Optionally, SGEDMD returns the residuals -! of the computed Ritz pairs, the information needed for -! a refinement of the Ritz vectors, or the eigenvectors of -! the Exact DMD. -! For further details see the references listed -! below. For more details of the implementation see [3]. -! -! References -! ========== -! [1] P. Schmid: Dynamic mode decomposition of numerical -! and experimental data, -! Journal of Fluid Mechanics 656, 5-28, 2010. -! [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal -! decompositions: analysis and enhancements, -! SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. -! [3] Z. Drmac: A LAPACK implementation of the Dynamic -! Mode Decomposition I. Technical report. AIMDyn Inc. -! and LAPACK Working Note 298. -! [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. -! Brunton, N. Kutz: On Dynamic Mode Decomposition: -! Theory and Applications, Journal of Computational -! Dynamics 1(2), 391 -421, 2014. +! REAL(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) +! REAL(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & +! W(LDW,*), S(LDS,*) +! REAL(KIND=WP), INTENT(OUT) :: REIG(*), IMEIG(*), & +! RES(*) +! REAL(KIND=WP), INTENT(OUT) :: WORK(*) +! INTEGER, INTENT(OUT) :: IWORK(*) ! +!............................................................ +!> \par Purpose: +! ============= +!> \verbatim +!> SGEDMD computes the Dynamic Mode Decomposition (DMD) for +!> a pair of data snapshot matrices. For the input matrices +!> X and Y such that Y = A*X with an unaccessible matrix +!> A, SGEDMD computes a certain number of Ritz pairs of A using +!> the standard Rayleigh-Ritz extraction from a subspace of +!> range(X) that is determined using the leading left singular +!> vectors of X. Optionally, SGEDMD returns the residuals +!> of the computed Ritz pairs, the information needed for +!> a refinement of the Ritz vectors, or the eigenvectors of +!> the Exact DMD. +!> For further details see the references listed +!> below. For more details of the implementation see [3]. +!> \endverbatim +!............................................................ +!> \par References: +! ================ +!> \verbatim +!> [1] P. Schmid: Dynamic mode decomposition of numerical +!> and experimental data, +!> Journal of Fluid Mechanics 656, 5-28, 2010. +!> [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal +!> decompositions: analysis and enhancements, +!> SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. +!> [3] Z. Drmac: A LAPACK implementation of the Dynamic +!> Mode Decomposition I. Technical report. AIMDyn Inc. +!> and LAPACK Working Note 298. +!> [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. +!> Brunton, N. 
Kutz: On Dynamic Mode Decomposition: +!> Theory and Applications, Journal of Computational +!> Dynamics 1(2), 391 -421, 2014. +!> \endverbatim !...................................................................... -! Developed and supported by: -! =========================== -! Developed and coded by Zlatko Drmac, Faculty of Science, -! University of Zagreb; drmac@math.hr -! In cooperation with -! AIMdyn Inc., Santa Barbara, CA. -! and supported by -! - DARPA SBIR project "Koopman Operator-Based Forecasting -! for Nonstationary Processes from Near-Term, Limited -! Observational Data" Contract No: W31P4Q-21-C-0007 -! - DARPA PAI project "Physics-Informed Machine Learning -! Methodologies" Contract No: HR0011-18-9-0033 -! - DARPA MoDyL project "A Data-Driven, Operator-Theoretic -! Framework for Space-Time Analysis of Process Dynamics" -! Contract No: HR0011-16-C-0116 -! Any opinions, findings and conclusions or recommendations -! expressed in this material are those of the author and -! do not necessarily reflect the views of the DARPA SBIR -! Program Office -!============================================================ -! Distribution Statement A: -! Approved for Public Release, Distribution Unlimited. -! Cleared by DARPA on September 29, 2022 -!============================================================ +!> \par Developed and supported by: +! ================================ +!> \verbatim +!> Developed and coded by Zlatko Drmac, Faculty of Science, +!> University of Zagreb; drmac@math.hr +!> In cooperation with +!> AIMdyn Inc., Santa Barbara, CA. +!> and supported by +!> - DARPA SBIR project "Koopman Operator-Based Forecasting +!> for Nonstationary Processes from Near-Term, Limited +!> Observational Data" Contract No: W31P4Q-21-C-0007 +!> - DARPA PAI project "Physics-Informed Machine Learning +!> Methodologies" Contract No: HR0011-18-9-0033 +!> - DARPA MoDyL project "A Data-Driven, Operator-Theoretic +!> Framework for Space-Time Analysis of Process Dynamics" +!> Contract No: HR0011-16-C-0116 +!> Any opinions, findings and conclusions or recommendations +!> expressed in this material are those of the author and +!> do not necessarily reflect the views of the DARPA SBIR +!> Program Office +!> \endverbatim !...................................................................... +!> \par Distribution Statement A: +! ============================== +!> \verbatim +!> Distribution Statement A: +!> Approved for Public Release, Distribution Unlimited. +!> Cleared by DARPA on September 29, 2022 +!> \endverbatim +!============================================================ ! Arguments ! ========= -! JOBS (input) CHARACTER*1 -! Determines whether the initial data snapshots are scaled -! by a diagonal matrix. -! 'S' :: The data snapshots matrices X and Y are multiplied -! with a diagonal matrix D so that X*D has unit -! nonzero columns (in the Euclidean 2-norm) -! 'C' :: The snapshots are scaled as with the 'S' option. -! If it is found that an i-th column of X is zero -! vector and the corresponding i-th column of Y is -! non-zero, then the i-th column of Y is set to -! zero and a warning flag is raised. -! 'Y' :: The data snapshots matrices X and Y are multiplied -! by a diagonal matrix D so that Y*D has unit -! nonzero columns (in the Euclidean 2-norm) -! 'N' :: No data scaling. +! +!> \param[in] JOBS +!> \verbatim +!> JOBS (input) CHARACTER*1 +!> Determines whether the initial data snapshots are scaled +!> by a diagonal matrix. 
+!> 'S' :: The data snapshots matrices X and Y are multiplied +!> with a diagonal matrix D so that X*D has unit +!> nonzero columns (in the Euclidean 2-norm) +!> 'C' :: The snapshots are scaled as with the 'S' option. +!> If it is found that an i-th column of X is zero +!> vector and the corresponding i-th column of Y is +!> non-zero, then the i-th column of Y is set to +!> zero and a warning flag is raised. +!> 'Y' :: The data snapshots matrices X and Y are multiplied +!> by a diagonal matrix D so that Y*D has unit +!> nonzero columns (in the Euclidean 2-norm) +!> 'N' :: No data scaling. +!> \endverbatim !..... -! JOBZ (input) CHARACTER*1 -! Determines whether the eigenvectors (Koopman modes) will -! be computed. -! 'V' :: The eigenvectors (Koopman modes) will be computed -! and returned in the matrix Z. -! See the description of Z. -! 'F' :: The eigenvectors (Koopman modes) will be returned -! in factored form as the product X(:,1:K)*W, where X -! contains a POD basis (leading left singular vectors -! of the data matrix X) and W contains the eigenvectors -! of the corresponding Rayleigh quotient. -! See the descriptions of K, X, W, Z. -! 'N' :: The eigenvectors are not computed. +!> \param[in] JOBZ +!> \verbatim +!> JOBZ (input) CHARACTER*1 +!> Determines whether the eigenvectors (Koopman modes) will +!> be computed. +!> 'V' :: The eigenvectors (Koopman modes) will be computed +!> and returned in the matrix Z. +!> See the description of Z. +!> 'F' :: The eigenvectors (Koopman modes) will be returned +!> in factored form as the product X(:,1:K)*W, where X +!> contains a POD basis (leading left singular vectors +!> of the data matrix X) and W contains the eigenvectors +!> of the corresponding Rayleigh quotient. +!> See the descriptions of K, X, W, Z. +!> 'N' :: The eigenvectors are not computed. +!> \endverbatim !..... -! JOBR (input) CHARACTER*1 -! Determines whether to compute the residuals. -! 'R' :: The residuals for the computed eigenpairs will be -! computed and stored in the array RES. -! See the description of RES. -! For this option to be legal, JOBZ must be 'V'. -! 'N' :: The residuals are not computed. +!> \param[in] JOBR +!> \verbatim +!> JOBR (input) CHARACTER*1 +!> Determines whether to compute the residuals. +!> 'R' :: The residuals for the computed eigenpairs will be +!> computed and stored in the array RES. +!> See the description of RES. +!> For this option to be legal, JOBZ must be 'V'. +!> 'N' :: The residuals are not computed. +!> \endverbatim !..... -! JOBF (input) CHARACTER*1 -! Specifies whether to store information needed for post- -! processing (e.g. computing refined Ritz vectors) -! 'R' :: The matrix needed for the refinement of the Ritz -! vectors is computed and stored in the array B. -! See the description of B. -! 'E' :: The unscaled eigenvectors of the Exact DMD are -! computed and returned in the array B. See the -! description of B. -! 'N' :: No eigenvector refinement data is computed. +!> \param[in] JOBF +!> \verbatim +!> JOBF (input) CHARACTER*1 +!> Specifies whether to store information needed for post- +!> processing (e.g. computing refined Ritz vectors) +!> 'R' :: The matrix needed for the refinement of the Ritz +!> vectors is computed and stored in the array B. +!> See the description of B. +!> 'E' :: The unscaled eigenvectors of the Exact DMD are +!> computed and returned in the array B. See the +!> description of B. +!> 'N' :: No eigenvector refinement data is computed. +!> \endverbatim !..... -! WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } -! 
Allows for a selection of the SVD algorithm from the -! LAPACK library. -! 1 :: SGESVD (the QR SVD algorithm) -! 2 :: SGESDD (the Divide and Conquer algorithm; if enough -! workspace available, this is the fastest option) -! 3 :: SGESVDQ (the preconditioned QR SVD ; this and 4 -! are the most accurate options) -! 4 :: SGEJSV (the preconditioned Jacobi SVD; this and 3 -! are the most accurate options) -! For the four methods above, a significant difference in -! the accuracy of small singular values is possible if -! the snapshots vary in norm so that X is severely -! ill-conditioned. If small (smaller than EPS*||X||) -! singular values are of interest and JOBS=='N', then -! the options (3, 4) give the most accurate results, where -! the option 4 is slightly better and with stronger -! theoretical background. -! If JOBS=='S', i.e. the columns of X will be normalized, -! then all methods give nearly equally accurate results. +!> \param[in] WHTSVD +!> \verbatim +!> WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } +!> Allows for a selection of the SVD algorithm from the +!> LAPACK library. +!> 1 :: SGESVD (the QR SVD algorithm) +!> 2 :: SGESDD (the Divide and Conquer algorithm; if enough +!> workspace available, this is the fastest option) +!> 3 :: SGESVDQ (the preconditioned QR SVD ; this and 4 +!> are the most accurate options) +!> 4 :: SGEJSV (the preconditioned Jacobi SVD; this and 3 +!> are the most accurate options) +!> For the four methods above, a significant difference in +!> the accuracy of small singular values is possible if +!> the snapshots vary in norm so that X is severely +!> ill-conditioned. If small (smaller than EPS*||X||) +!> singular values are of interest and JOBS=='N', then +!> the options (3, 4) give the most accurate results, where +!> the option 4 is slightly better and with stronger +!> theoretical background. +!> If JOBS=='S', i.e. the columns of X will be normalized, +!> then all methods give nearly equally accurate results. +!> \endverbatim !..... -! M (input) INTEGER, M>= 0 -! The state space dimension (the row dimension of X, Y). +!> \param[in] M +!> \verbatim +!> M (input) INTEGER, M>= 0 +!> The state space dimension (the row dimension of X, Y). +!> \endverbatim !..... -! N (input) INTEGER, 0 <= N <= M -! The number of data snapshot pairs -! (the number of columns of X and Y). +!> \param[in] N +!> \verbatim +!> N (input) INTEGER, 0 <= N <= M +!> The number of data snapshot pairs +!> (the number of columns of X and Y). +!> \endverbatim !..... -! X (input/output) REAL(KIND=WP) M-by-N array -! > On entry, X contains the data snapshot matrix X. It is -! assumed that the column norms of X are in the range of -! the normalized floating point numbers. -! < On exit, the leading K columns of X contain a POD basis, -! i.e. the leading K left singular vectors of the input -! data matrix X, U(:,1:K). All N columns of X contain all -! left singular vectors of the input matrix X. -! See the descriptions of K, Z and W. +!> \param[in,out] X +!> \verbatim +!> X (input/output) REAL(KIND=WP) M-by-N array +!> > On entry, X contains the data snapshot matrix X. It is +!> assumed that the column norms of X are in the range of +!> the normalized floating point numbers. +!> < On exit, the leading K columns of X contain a POD basis, +!> i.e. the leading K left singular vectors of the input +!> data matrix X, U(:,1:K). All N columns of X contain all +!> left singular vectors of the input matrix X. +!> See the descriptions of K, Z and W. +!> \endverbatim !..... -! 
LDX (input) INTEGER, LDX >= M -! The leading dimension of the array X. +!> \param[in] LDX +!> \verbatim +!> LDX (input) INTEGER, LDX >= M +!> The leading dimension of the array X. +!> \endverbatim !..... -! Y (input/workspace/output) REAL(KIND=WP) M-by-N array -! > On entry, Y contains the data snapshot matrix Y -! < On exit, -! If JOBR == 'R', the leading K columns of Y contain -! the residual vectors for the computed Ritz pairs. -! See the description of RES. -! If JOBR == 'N', Y contains the original input data, -! scaled according to the value of JOBS. +!> \param[in,out] Y +!> \verbatim +!> Y (input/workspace/output) REAL(KIND=WP) M-by-N array +!> > On entry, Y contains the data snapshot matrix Y +!> < On exit, +!> If JOBR == 'R', the leading K columns of Y contain +!> the residual vectors for the computed Ritz pairs. +!> See the description of RES. +!> If JOBR == 'N', Y contains the original input data, +!> scaled according to the value of JOBS. +!> \endverbatim !..... -! LDY (input) INTEGER , LDY >= M -! The leading dimension of the array Y. +!> \param[in] LDY +!> \verbatim +!> LDY (input) INTEGER , LDY >= M +!> The leading dimension of the array Y. +!> \endverbatim !..... -! NRNK (input) INTEGER -! Determines the mode how to compute the numerical rank, -! i.e. how to truncate small singular values of the input -! matrix X. On input, if -! NRNK = -1 :: i-th singular value sigma(i) is truncated -! if sigma(i) <= TOL*sigma(1) -! This option is recommended. -! NRNK = -2 :: i-th singular value sigma(i) is truncated -! if sigma(i) <= TOL*sigma(i-1) -! This option is included for R&D purposes. -! It requires highly accurate SVD, which -! may not be feasible. -! The numerical rank can be enforced by using positive -! value of NRNK as follows: -! 0 < NRNK <= N :: at most NRNK largest singular values -! will be used. If the number of the computed nonzero -! singular values is less than NRNK, then only those -! nonzero values will be used and the actually used -! dimension is less than NRNK. The actual number of -! the nonzero singular values is returned in the variable -! K. See the descriptions of TOL and K. +!> \param[in] NRNK +!> \verbatim +!> NRNK (input) INTEGER +!> Determines the mode how to compute the numerical rank, +!> i.e. how to truncate small singular values of the input +!> matrix X. On input, if +!> NRNK = -1 :: i-th singular value sigma(i) is truncated +!> if sigma(i) <= TOL*sigma(1) +!> This option is recommended. +!> NRNK = -2 :: i-th singular value sigma(i) is truncated +!> if sigma(i) <= TOL*sigma(i-1) +!> This option is included for R&D purposes. +!> It requires highly accurate SVD, which +!> may not be feasible. +!> The numerical rank can be enforced by using positive +!> value of NRNK as follows: +!> 0 < NRNK <= N :: at most NRNK largest singular values +!> will be used. If the number of the computed nonzero +!> singular values is less than NRNK, then only those +!> nonzero values will be used and the actually used +!> dimension is less than NRNK. The actual number of +!> the nonzero singular values is returned in the variable +!> K. See the descriptions of TOL and K. +!> \endverbatim !..... -! TOL (input) REAL(KIND=WP), 0 <= TOL < 1 -! The tolerance for truncating small singular values. -! See the description of NRNK. +!> \param[in] TOL +!> \verbatim +!> TOL (input) REAL(KIND=WP), 0 <= TOL < 1 +!> The tolerance for truncating small singular values. +!> See the description of NRNK. +!> \endverbatim !..... -! K (output) INTEGER, 0 <= K <= N -! 
The dimension of the POD basis for the data snapshot -! matrix X and the number of the computed Ritz pairs. -! The value of K is determined according to the rule set -! by the parameters NRNK and TOL. -! See the descriptions of NRNK and TOL. +!> \param[out] K +!> \verbatim +!> K (output) INTEGER, 0 <= K <= N +!> The dimension of the POD basis for the data snapshot +!> matrix X and the number of the computed Ritz pairs. +!> The value of K is determined according to the rule set +!> by the parameters NRNK and TOL. +!> See the descriptions of NRNK and TOL. +!> \endverbatim !..... -! REIG (output) REAL(KIND=WP) N-by-1 array -! The leading K (K<=N) entries of REIG contain -! the real parts of the computed eigenvalues -! REIG(1:K) + sqrt(-1)*IMEIG(1:K). -! See the descriptions of K, IMEIG, and Z. +!> \param[out] REIG +!> \verbatim +!> REIG (output) REAL(KIND=WP) N-by-1 array +!> The leading K (K<=N) entries of REIG contain +!> the real parts of the computed eigenvalues +!> REIG(1:K) + sqrt(-1)*IMEIG(1:K). +!> See the descriptions of K, IMEIG, and Z. +!> \endverbatim !..... -! IMEIG (output) REAL(KIND=WP) N-by-1 array -! The leading K (K<=N) entries of IMEIG contain -! the imaginary parts of the computed eigenvalues -! REIG(1:K) + sqrt(-1)*IMEIG(1:K). -! The eigenvalues are determined as follows: -! If IMEIG(i) == 0, then the corresponding eigenvalue is -! real, LAMBDA(i) = REIG(i). -! If IMEIG(i)>0, then the corresponding complex -! conjugate pair of eigenvalues reads -! LAMBDA(i) = REIG(i) + sqrt(-1)*IMAG(i) -! LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMAG(i) -! That is, complex conjugate pairs have consecutive -! indices (i,i+1), with the positive imaginary part -! listed first. -! See the descriptions of K, REIG, and Z. +!> \param[out] IMEIG +!> \verbatim +!> IMEIG (output) REAL(KIND=WP) N-by-1 array +!> The leading K (K<=N) entries of IMEIG contain +!> the imaginary parts of the computed eigenvalues +!> REIG(1:K) + sqrt(-1)*IMEIG(1:K). +!> The eigenvalues are determined as follows: +!> If IMEIG(i) == 0, then the corresponding eigenvalue is +!> real, LAMBDA(i) = REIG(i). +!> If IMEIG(i)>0, then the corresponding complex +!> conjugate pair of eigenvalues reads +!> LAMBDA(i) = REIG(i) + sqrt(-1)*IMAG(i) +!> LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMAG(i) +!> That is, complex conjugate pairs have consecutive +!> indices (i,i+1), with the positive imaginary part +!> listed first. +!> See the descriptions of K, REIG, and Z. +!> \endverbatim !..... -! Z (workspace/output) REAL(KIND=WP) M-by-N array -! If JOBZ =='V' then -! Z contains real Ritz vectors as follows: -! If IMEIG(i)=0, then Z(:,i) is an eigenvector of -! the i-th Ritz value; ||Z(:,i)||_2=1. -! If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then -! [Z(:,i) Z(:,i+1)] span an invariant subspace and -! the Ritz values extracted from this subspace are -! REIG(i) + sqrt(-1)*IMEIG(i) and -! REIG(i) - sqrt(-1)*IMEIG(i). -! The corresponding eigenvectors are -! Z(:,i) + sqrt(-1)*Z(:,i+1) and -! Z(:,i) - sqrt(-1)*Z(:,i+1), respectively. -! || Z(:,i:i+1)||_F = 1. -! If JOBZ == 'F', then the above descriptions hold for -! the columns of X(:,1:K)*W(1:K,1:K), where the columns -! of W(1:k,1:K) are the computed eigenvectors of the -! K-by-K Rayleigh quotient. The columns of W(1:K,1:K) -! are similarly structured: If IMEIG(i) == 0 then -! X(:,1:K)*W(:,i) is an eigenvector, and if IMEIG(i)>0 -! then X(:,1:K)*W(:,i)+sqrt(-1)*X(:,1:K)*W(:,i+1) and -! X(:,1:K)*W(:,i)-sqrt(-1)*X(:,1:K)*W(:,i+1) -! are the eigenvectors of LAMBDA(i), LAMBDA(i+1). -! 
See the descriptions of REIG, IMEIG, X and W. +!> \param[out] Z +!> \verbatim +!> Z (workspace/output) REAL(KIND=WP) M-by-N array +!> If JOBZ =='V' then +!> Z contains real Ritz vectors as follows: +!> If IMEIG(i)=0, then Z(:,i) is an eigenvector of +!> the i-th Ritz value; ||Z(:,i)||_2=1. +!> If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then +!> [Z(:,i) Z(:,i+1)] span an invariant subspace and +!> the Ritz values extracted from this subspace are +!> REIG(i) + sqrt(-1)*IMEIG(i) and +!> REIG(i) - sqrt(-1)*IMEIG(i). +!> The corresponding eigenvectors are +!> Z(:,i) + sqrt(-1)*Z(:,i+1) and +!> Z(:,i) - sqrt(-1)*Z(:,i+1), respectively. +!> || Z(:,i:i+1)||_F = 1. +!> If JOBZ == 'F', then the above descriptions hold for +!> the columns of X(:,1:K)*W(1:K,1:K), where the columns +!> of W(1:k,1:K) are the computed eigenvectors of the +!> K-by-K Rayleigh quotient. The columns of W(1:K,1:K) +!> are similarly structured: If IMEIG(i) == 0 then +!> X(:,1:K)*W(:,i) is an eigenvector, and if IMEIG(i)>0 +!> then X(:,1:K)*W(:,i)+sqrt(-1)*X(:,1:K)*W(:,i+1) and +!> X(:,1:K)*W(:,i)-sqrt(-1)*X(:,1:K)*W(:,i+1) +!> are the eigenvectors of LAMBDA(i), LAMBDA(i+1). +!> See the descriptions of REIG, IMEIG, X and W. +!> \endverbatim !..... -! LDZ (input) INTEGER , LDZ >= M -! The leading dimension of the array Z. +!> \param[in] LDZ +!> \verbatim +!> LDZ (input) INTEGER , LDZ >= M +!> The leading dimension of the array Z. +!> \endverbatim !..... -! RES (output) REAL(KIND=WP) N-by-1 array -! RES(1:K) contains the residuals for the K computed -! Ritz pairs. -! If LAMBDA(i) is real, then -! RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i))||_2. -! If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair -! then -! RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F -! where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ] -! [-imag(LAMBDA(i)) real(LAMBDA(i)) ]. -! It holds that -! RES(i) = || A*ZC(:,i) - LAMBDA(i) *ZC(:,i) ||_2 -! RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2 -! where ZC(:,i) = Z(:,i) + sqrt(-1)*Z(:,i+1) -! ZC(:,i+1) = Z(:,i) - sqrt(-1)*Z(:,i+1) -! See the description of REIG, IMEIG and Z. +!> \param[out] RES +!> \verbatim +!> RES (output) REAL(KIND=WP) N-by-1 array +!> RES(1:K) contains the residuals for the K computed +!> Ritz pairs. +!> If LAMBDA(i) is real, then +!> RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i))||_2. +!> If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair +!> then +!> RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F +!> where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ] +!> [-imag(LAMBDA(i)) real(LAMBDA(i)) ]. +!> It holds that +!> RES(i) = || A*ZC(:,i) - LAMBDA(i) *ZC(:,i) ||_2 +!> RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2 +!> where ZC(:,i) = Z(:,i) + sqrt(-1)*Z(:,i+1) +!> ZC(:,i+1) = Z(:,i) - sqrt(-1)*Z(:,i+1) +!> See the description of REIG, IMEIG and Z. +!> \endverbatim !..... -! B (output) REAL(KIND=WP) M-by-N array. -! IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can -! be used for computing the refined vectors; see further -! details in the provided references. -! If JOBF == 'E', B(1:M,1;K) contains -! A*U(:,1:K)*W(1:K,1:K), which are the vectors from the -! Exact DMD, up to scaling by the inverse eigenvalues. -! If JOBF =='N', then B is not referenced. -! See the descriptions of X, W, K. +!> \param[out] B +!> \verbatim +!> B (output) REAL(KIND=WP) M-by-N array. +!> IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can +!> be used for computing the refined vectors; see further +!> details in the provided references. 
+!> If JOBF == 'E', B(1:M,1;K) contains +!> A*U(:,1:K)*W(1:K,1:K), which are the vectors from the +!> Exact DMD, up to scaling by the inverse eigenvalues. +!> If JOBF =='N', then B is not referenced. +!> See the descriptions of X, W, K. +!> \endverbatim !..... -! LDB (input) INTEGER, LDB >= M -! The leading dimension of the array B. +!> \param[in] LDB +!> \verbatim +!> LDB (input) INTEGER, LDB >= M +!> The leading dimension of the array B. +!> \endverbatim !..... -! W (workspace/output) REAL(KIND=WP) N-by-N array -! On exit, W(1:K,1:K) contains the K computed -! eigenvectors of the matrix Rayleigh quotient (real and -! imaginary parts for each complex conjugate pair of the -! eigenvalues). The Ritz vectors (returned in Z) are the -! product of X (containing a POD basis for the input -! matrix X) and W. See the descriptions of K, S, X and Z. -! W is also used as a workspace to temporarily store the -! left singular vectors of X. +!> \param[out] W +!> \verbatim +!> W (workspace/output) REAL(KIND=WP) N-by-N array +!> On exit, W(1:K,1:K) contains the K computed +!> eigenvectors of the matrix Rayleigh quotient (real and +!> imaginary parts for each complex conjugate pair of the +!> eigenvalues). The Ritz vectors (returned in Z) are the +!> product of X (containing a POD basis for the input +!> matrix X) and W. See the descriptions of K, S, X and Z. +!> W is also used as a workspace to temporarily store the +!> left singular vectors of X. +!> \endverbatim !..... -! LDW (input) INTEGER, LDW >= N -! The leading dimension of the array W. +!> \param[in] LDW +!> \verbatim +!> LDW (input) INTEGER, LDW >= N +!> The leading dimension of the array W. +!> \endverbatim !..... -! S (workspace/output) REAL(KIND=WP) N-by-N array -! The array S(1:K,1:K) is used for the matrix Rayleigh -! quotient. This content is overwritten during -! the eigenvalue decomposition by SGEEV. -! See the description of K. +!> \param[out] S +!> \verbatim +!> S (workspace/output) REAL(KIND=WP) N-by-N array +!> The array S(1:K,1:K) is used for the matrix Rayleigh +!> quotient. This content is overwritten during +!> the eigenvalue decomposition by SGEEV. +!> See the description of K. +!> \endverbatim !..... -! LDS (input) INTEGER, LDS >= N -! The leading dimension of the array S. +!> \param[in] LDS +!> \verbatim +!> LDS (input) INTEGER, LDS >= N +!> The leading dimension of the array S. +!> \endverbatim !..... -! WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array -! On exit, WORK(1:N) contains the singular values of -! X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). -! If WHTSVD==4, then WORK(N+1) and WORK(N+2) contain -! scaling factor WORK(N+2)/WORK(N+1) used to scale X -! and Y to avoid overflow in the SVD of X. -! This may be of interest if the scaling option is off -! and as many as possible smallest eigenvalues are -! desired to the highest feasible accuracy. -! If the call to SGEDMD is only workspace query, then -! WORK(1) contains the minimal workspace length and -! WORK(2) is the optimal workspace length. Hence, the -! length of work is at least 2. -! See the description of LWORK. +!> \param[out] WORK +!> \verbatim +!> WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array +!> On exit, WORK(1:N) contains the singular values of +!> X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). +!> If WHTSVD==4, then WORK(N+1) and WORK(N+2) contain +!> scaling factor WORK(N+2)/WORK(N+1) used to scale X +!> and Y to avoid overflow in the SVD of X. 
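[Reviewer note, illustration only, not part of the patch] Since the SGEDMD comment block is being reshaped into doxygen form here, a compact usage sketch may help readers of the new docs. Only the argument list and the LWORK = -1 / LIWORK = -1 query behaviour come from this file's documentation; the snapshot sizes, random data, and the choices WHTSVD = 1, NRNK = -1, TOL = 1.0e-6 are assumptions for illustration, as is a default REAL kind of 32 bits:

      program demo_sgedmd
         implicit none
         integer, parameter :: m = 64, n = 16      ! arbitrary snapshot dimensions
         real, allocatable :: x(:,:), y(:,:), z(:,:), b(:,:), w(:,:), s(:,:)
         real, allocatable :: reig(:), imeig(:), res(:), work(:)
         integer, allocatable :: iwork(:)
         real :: wq(2), tol
         integer :: iq(1), k, info, lwork, liwork

         allocate( x(m,n), y(m,n), z(m,n), b(m,n), w(n,n), s(n,n) )
         allocate( reig(n), imeig(n), res(n) )
         call random_number( x )
         call random_number( y )                   ! in real use, Y = A*X for the unknown A
         tol = 1.0e-6

         ! Query: WORK(1)/WORK(2) return the minimal/optimal LWORK and
         ! IWORK(1) the minimal LIWORK, as described in this comment block.
         call sgedmd( 'N', 'V', 'R', 'N', 1, m, n, x, m, y, m, -1, tol, &
                      k, reig, imeig, z, m, res, b, m, w, n, s, n,      &
                      wq, -1, iq, -1, info )
         lwork  = int( wq(2) )
         liwork = max( 1, iq(1) )
         allocate( work(lwork), iwork(liwork) )

         call sgedmd( 'N', 'V', 'R', 'N', 1, m, n, x, m, y, m, -1, tol, &
                      k, reig, imeig, z, m, res, b, m, w, n, s, n,      &
                      work, lwork, iwork, liwork, info )
         print *, 'sgedmd: info =', info, ' Ritz pairs k =', k
      end program demo_sgedmd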
+!> This may be of interest if the scaling option is off +!> and as many as possible smallest eigenvalues are +!> desired to the highest feasible accuracy. +!> If the call to SGEDMD is only workspace query, then +!> WORK(1) contains the minimal workspace length and +!> WORK(2) is the optimal workspace length. Hence, the +!> length of work is at least 2. +!> See the description of LWORK. +!> \endverbatim !..... -! LWORK (input) INTEGER -! The minimal length of the workspace vector WORK. -! LWORK is calculated as follows: -! If WHTSVD == 1 :: -! If JOBZ == 'V', then -! LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)). -! If JOBZ == 'N' then -! LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)). -! Here LWORK_SVD = MAX(1,3*N+M,5*N) is the minimal -! workspace length of SGESVD. -! If WHTSVD == 2 :: -! If JOBZ == 'V', then -! LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)) -! If JOBZ == 'N', then -! LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)) -! Here LWORK_SVD = MAX(M, 5*N*N+4*N)+3*N*N is the -! minimal workspace length of SGESDD. -! If WHTSVD == 3 :: -! If JOBZ == 'V', then -! LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) -! If JOBZ == 'N', then -! LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) -! Here LWORK_SVD = N+M+MAX(3*N+1, -! MAX(1,3*N+M,5*N),MAX(1,N)) -! is the minimal workspace length of SGESVDQ. -! If WHTSVD == 4 :: -! If JOBZ == 'V', then -! LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) -! If JOBZ == 'N', then -! LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) -! Here LWORK_SVD = MAX(7,2*M+N,6*N+2*N*N) is the -! minimal workspace length of SGEJSV. -! The above expressions are not simplified in order to -! make the usage of WORK more transparent, and for -! easier checking. In any case, LWORK >= 2. -! If on entry LWORK = -1, then a workspace query is -! assumed and the procedure only computes the minimal -! and the optimal workspace lengths for both WORK and -! IWORK. See the descriptions of WORK and IWORK. +!> \param[in] LWORK +!> \verbatim +!> LWORK (input) INTEGER +!> The minimal length of the workspace vector WORK. +!> LWORK is calculated as follows: +!> If WHTSVD == 1 :: +!> If JOBZ == 'V', then +!> LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)). +!> If JOBZ == 'N' then +!> LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)). +!> Here LWORK_SVD = MAX(1,3*N+M,5*N) is the minimal +!> workspace length of SGESVD. +!> If WHTSVD == 2 :: +!> If JOBZ == 'V', then +!> LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)) +!> If JOBZ == 'N', then +!> LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)) +!> Here LWORK_SVD = MAX(M, 5*N*N+4*N)+3*N*N is the +!> minimal workspace length of SGESDD. +!> If WHTSVD == 3 :: +!> If JOBZ == 'V', then +!> LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) +!> If JOBZ == 'N', then +!> LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) +!> Here LWORK_SVD = N+M+MAX(3*N+1, +!> MAX(1,3*N+M,5*N),MAX(1,N)) +!> is the minimal workspace length of SGESVDQ. +!> If WHTSVD == 4 :: +!> If JOBZ == 'V', then +!> LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) +!> If JOBZ == 'N', then +!> LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) +!> Here LWORK_SVD = MAX(7,2*M+N,6*N+2*N*N) is the +!> minimal workspace length of SGEJSV. +!> The above expressions are not simplified in order to +!> make the usage of WORK more transparent, and for +!> easier checking. In any case, LWORK >= 2. +!> If on entry LWORK = -1, then a workspace query is +!> assumed and the procedure only computes the minimal +!> and the optimal workspace lengths for both WORK and +!> IWORK. See the descriptions of WORK and IWORK. +!> \endverbatim !..... -! 
IWORK (workspace/output) INTEGER LIWORK-by-1 array -! Workspace that is required only if WHTSVD equals -! 2 , 3 or 4. (See the description of WHTSVD). -! If on entry LWORK =-1 or LIWORK=-1, then the -! minimal length of IWORK is computed and returned in -! IWORK(1). See the description of LIWORK. +!> \param[out] IWORK +!> \verbatim +!> IWORK (workspace/output) INTEGER LIWORK-by-1 array +!> Workspace that is required only if WHTSVD equals +!> 2 , 3 or 4. (See the description of WHTSVD). +!> If on entry LWORK =-1 or LIWORK=-1, then the +!> minimal length of IWORK is computed and returned in +!> IWORK(1). See the description of LIWORK. +!> \endverbatim !..... -! LIWORK (input) INTEGER -! The minimal length of the workspace vector IWORK. -! If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 -! If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) -! If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) -! If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) -! If on entry LIWORK = -1, then a workspace query is -! assumed and the procedure only computes the minimal -! and the optimal workspace lengths for both WORK and -! IWORK. See the descriptions of WORK and IWORK. +!> \param[in] LIWORK +!> \verbatim +!> LIWORK (input) INTEGER +!> The minimal length of the workspace vector IWORK. +!> If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 +!> If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) +!> If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) +!> If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) +!> If on entry LIWORK = -1, then a workspace query is +!> assumed and the procedure only computes the minimal +!> and the optimal workspace lengths for both WORK and +!> IWORK. See the descriptions of WORK and IWORK. +!> \endverbatim !..... -! INFO (output) INTEGER -! -i < 0 :: On entry, the i-th argument had an -! illegal value -! = 0 :: Successful return. -! = 1 :: Void input. Quick exit (M=0 or N=0). -! = 2 :: The SVD computation of X did not converge. -! Suggestion: Check the input data and/or -! repeat with different WHTSVD. -! = 3 :: The computation of the eigenvalues did not -! converge. -! = 4 :: If data scaling was requested on input and -! the procedure found inconsistency in the data -! such that for some column index i, -! X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set -! to zero if JOBS=='C'. The computation proceeds -! with original or modified data and warning -! flag is set with INFO=4. +!> \param[out] INFO +!> \verbatim +!> INFO (output) INTEGER +!> -i < 0 :: On entry, the i-th argument had an +!> illegal value +!> = 0 :: Successful return. +!> = 1 :: Void input. Quick exit (M=0 or N=0). +!> = 2 :: The SVD computation of X did not converge. +!> Suggestion: Check the input data and/or +!> repeat with different WHTSVD. +!> = 3 :: The computation of the eigenvalues did not +!> converge. +!> = 4 :: If data scaling was requested on input and +!> the procedure found inconsistency in the data +!> such that for some column index i, +!> X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set +!> to zero if JOBS=='C'. The computation proceeds +!> with original or modified data and warning +!> flag is set with INFO=4. +!> \endverbatim +! +! Authors: +! ======== +! +!> \author Zlatko Drmac +! +!> \ingroup gedmd +! !............................................................. !............................................................. + SUBROUTINE SGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & + M, N, X, LDX, Y, LDY, NRNK, TOL, & + K, REIG, IMEIG, Z, LDZ, RES, & + B, LDB, W, LDW, S, LDS, & + WORK, LWORK, IWORK, LIWORK, INFO ) +! +! 
-- LAPACK driver routine -- +! +! -- LAPACK is a software package provided by University of -- +! -- Tennessee, University of California Berkeley, University of -- +! -- Colorado Denver and NAG Ltd.. -- +! +!..... + USE iso_fortran_env + IMPLICIT NONE + INTEGER, PARAMETER :: WP = real32 +! +! Scalar arguments +! ~~~~~~~~~~~~~~~~ + CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF + INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & + NRNK, LDZ, LDB, LDW, LDS, & + LWORK, LIWORK + INTEGER, INTENT(OUT) :: K, INFO + REAL(KIND=WP), INTENT(IN) :: TOL +! +! Array arguments +! ~~~~~~~~~~~~~~~ + REAL(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) + REAL(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & + W(LDW,*), S(LDS,*) + REAL(KIND=WP), INTENT(OUT) :: REIG(*), IMEIG(*), & + RES(*) + REAL(KIND=WP), INTENT(OUT) :: WORK(*) + INTEGER, INTENT(OUT) :: IWORK(*) +! ! Parameters ! ~~~~~~~~~~ REAL(KIND=WP), PARAMETER :: ONE = 1.0_WP REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP - +! ! Local scalars ! ~~~~~~~~~~~~~ REAL(KIND=WP) :: OFL, ROOTSC, SCALE, SMALL, & @@ -431,11 +581,11 @@ WNTEX, WNTREF, WNTRES, WNTVEC CHARACTER :: JOBZL, T_OR_N CHARACTER :: JSVOPT - +! ! Local arrays ! ~~~~~~~~~~~~ REAL(KIND=WP) :: AB(2,2), RDUMMY(2), RDUMMY2(2) - +! ! External functions (BLAS and LAPACK) ! ~~~~~~~~~~~~~~~~~ REAL(KIND=WP) SLANGE, SLAMCH, SNRM2 @@ -443,13 +593,13 @@ INTEGER ISAMAX LOGICAL SISNAN, LSAME EXTERNAL SISNAN, LSAME - +! ! External subroutines (BLAS and LAPACK) ! ~~~~~~~~~~~~~~~~~~~~ EXTERNAL SAXPY, SGEMM, SSCAL EXTERNAL SGEEV, SGEJSV, SGESDD, SGESVD, SGESVDQ, & SLACPY, SLASCL, SLASSQ, XERBLA - +! ! Intrinsic functions ! ~~~~~~~~~~~~~~~~~~~ INTRINSIC INT, FLOAT, MAX, SQRT @@ -632,7 +782,8 @@ K = 0 DO i = 1, N !WORK(i) = DNRM2( M, X(1,i), 1 ) - SCALE = ZERO + SSUM = ONE + SCALE = ZERO CALL SLASSQ( M, X(1,i), 1, SCALE, SSUM ) IF ( SISNAN(SCALE) .OR. SISNAN(SSUM) ) THEN K = 0 @@ -705,7 +856,8 @@ ! carefully computed using SLASSQ. DO i = 1, N !WORK(i) = DNRM2( M, Y(1,i), 1 ) - SCALE = ZERO + SSUM = ONE + SCALE = ZERO CALL SLASSQ( M, Y(1,i), 1, SCALE, SSUM ) IF ( SISNAN(SCALE) .OR. SISNAN(SSUM) ) THEN K = 0 diff --git a/lapack-netlib/SRC/sgehrd.f b/lapack-netlib/SRC/sgehrd.f index 47733d947..cfa17e156 100644 --- a/lapack-netlib/SRC/sgehrd.f +++ b/lapack-netlib/SRC/sgehrd.f @@ -89,7 +89,7 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL array, dimension (LWORK) +*> WORK is REAL array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> @@ -173,7 +173,7 @@ INTEGER IHI, ILO, INFO, LDA, LWORK, N * .. * .. Array Arguments .. - REAL A( LDA, * ), TAU( * ), WORK( * ) + REAL A( LDA, * ), TAU( * ), WORK( * ) * .. * * ===================================================================== @@ -182,7 +182,7 @@ INTEGER NBMAX, LDT, TSIZE PARAMETER ( NBMAX = 64, LDT = NBMAX+1, $ TSIZE = LDT*NBMAX ) - REAL ZERO, ONE + REAL ZERO, ONE PARAMETER ( ZERO = 0.0E+0, $ ONE = 1.0E+0 ) * .. @@ -190,7 +190,7 @@ LOGICAL LQUERY INTEGER I, IB, IINFO, IWT, J, LDWORK, LWKOPT, NB, $ NBMIN, NH, NX - REAL EI + REAL EI * .. * .. External Subroutines .. 
EXTERNAL SAXPY, SGEHD2, SGEMM, SLAHR2, SLARFB, STRMM, @@ -222,13 +222,19 @@ INFO = -8 END IF * + NH = IHI - ILO + 1 IF( INFO.EQ.0 ) THEN * * Compute the workspace requirements * - NB = MIN( NBMAX, ILAENV( 1, 'SGEHRD', ' ', N, ILO, IHI, -1 ) ) - LWKOPT = N*NB + TSIZE - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + IF( NH.LE.1 ) THEN + LWKOPT = 1 + ELSE + NB = MIN( NBMAX, ILAENV( 1, 'SGEHRD', ' ', N, ILO, IHI, + $ -1 ) ) + LWKOPT = N*NB + TSIZE + ENDIF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -249,7 +255,6 @@ * * Quick return if possible * - NH = IHI - ILO + 1 IF( NH.LE.1 ) THEN WORK( 1 ) = 1 RETURN @@ -269,7 +274,7 @@ * * Determine if workspace is large enough for blocked code * - IF( LWORK.LT.N*NB+TSIZE ) THEN + IF( LWORK.LT.LWKOPT ) THEN * * Not enough workspace to use optimal NB: determine the * minimum value of NB, and reduce NB or force use of @@ -345,7 +350,8 @@ * Use unblocked code to reduce the rest of the matrix * CALL SGEHD2( N, I, IHI, A, LDA, TAU, WORK, IINFO ) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) +* + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/sgelq.f b/lapack-netlib/SRC/sgelq.f index 74c7cc267..75f02675d 100644 --- a/lapack-netlib/SRC/sgelq.f +++ b/lapack-netlib/SRC/sgelq.f @@ -98,7 +98,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. The routine *> only calculates the sizes of the T and WORK arrays, returns these *> values as the first entries of the T and WORK arrays, and no error @@ -295,9 +295,9 @@ T( 2 ) = MB T( 3 ) = NB IF( MINW ) THEN - WORK( 1 ) = SROUNDUP_LWORK(LWMIN) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) ELSE - WORK( 1 ) = SROUNDUP_LWORK(LWREQ) + WORK( 1 ) = SROUNDUP_LWORK( LWREQ ) END IF END IF IF( INFO.NE.0 ) THEN @@ -322,7 +322,7 @@ $ LWORK, INFO ) END IF * - WORK( 1 ) = SROUNDUP_LWORK(LWREQ) + WORK( 1 ) = SROUNDUP_LWORK( LWREQ ) RETURN * * End of SGELQ diff --git a/lapack-netlib/SRC/sgelqf.f b/lapack-netlib/SRC/sgelqf.f index 1ceec4742..3b3913d84 100644 --- a/lapack-netlib/SRC/sgelqf.f +++ b/lapack-netlib/SRC/sgelqf.f @@ -93,7 +93,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,M). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= M, otherwise. *> For optimum performance LWORK >= M*NB, where NB is the *> optimal blocksize. *> @@ -175,9 +176,8 @@ * Test the input arguments * INFO = 0 + K = MIN( M, N ) NB = ILAENV( 1, 'SGELQF', ' ', M, N, -1, -1 ) - LWKOPT = M*NB - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -185,19 +185,25 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, M ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + ELSE IF( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( N.GT.0 .AND. 
LWORK.LT.MAX( 1, M ) ) ) + $ INFO = -7 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'SGELQF', -INFO ) RETURN ELSE IF( LQUERY ) THEN + IF( K.EQ.0 ) THEN + LWKOPT = 1 + ELSE + LWKOPT = M*NB + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN END IF * * Quick return if possible * - K = MIN( M, N ) IF( K.EQ.0 ) THEN WORK( 1 ) = 1 RETURN @@ -267,7 +273,7 @@ $ CALL SGELQ2( M-I+1, N-I+1, A( I, I ), LDA, TAU( I ), WORK, $ IINFO ) * - WORK( 1 ) = SROUNDUP_LWORK(IWS) + WORK( 1 ) = SROUNDUP_LWORK( IWS ) RETURN * * End of SGELQF diff --git a/lapack-netlib/SRC/sgemlq.f b/lapack-netlib/SRC/sgemlq.f index 83536825c..7e4d9bf65 100644 --- a/lapack-netlib/SRC/sgemlq.f +++ b/lapack-netlib/SRC/sgemlq.f @@ -110,13 +110,14 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) REAL array, dimension (MAX(1,LWORK)) +*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1, then a workspace query is assumed. The routine *> only calculates the size of the WORK array, returns this *> value as WORK(1), and no error message related to WORK @@ -187,7 +188,7 @@ * .. * .. Local Scalars .. LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER MB, NB, LW, NBLCKS, MN + INTEGER MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME @@ -207,7 +208,7 @@ * * Test the input arguments * - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'T' ) LEFT = LSAME( SIDE, 'L' ) @@ -222,6 +223,13 @@ LW = M * MB MN = N END IF +* + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF * IF( ( NB.GT.K ) .AND. ( MN.GT.K ) ) THEN IF( MOD( MN - K, NB - K ) .EQ. 0 ) THEN @@ -250,12 +258,12 @@ INFO = -9 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN INFO = -11 - ELSE IF( ( LWORK.LT.MAX( 1, LW ) ) .AND. ( .NOT.LQUERY ) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -13 END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = SROUNDUP_LWORK( LW ) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN @@ -267,7 +275,7 @@ * * Quick return if possible * - IF( MIN( M, N, K ).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * @@ -280,7 +288,7 @@ $ MB, C, LDC, WORK, LWORK, INFO ) END IF * - WORK( 1 ) = SROUNDUP_LWORK( LW ) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) * RETURN * diff --git a/lapack-netlib/SRC/sgemqr.f b/lapack-netlib/SRC/sgemqr.f index 3207f8bfd..19bf467b8 100644 --- a/lapack-netlib/SRC/sgemqr.f +++ b/lapack-netlib/SRC/sgemqr.f @@ -189,12 +189,13 @@ * .. * .. Local Scalars .. LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER MB, NB, LW, NBLCKS, MN + INTEGER MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME + EXTERNAL LSAME REAL SROUNDUP_LWORK - EXTERNAL LSAME, SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SGEMQRT, SLAMTSQR, XERBLA @@ -206,7 +207,7 @@ * * Test the input arguments * - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'T' ) LEFT = LSAME( SIDE, 'L' ) @@ -221,6 +222,13 @@ LW = MB * NB MN = N END IF +* + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF * IF( ( MB.GT.K ) .AND. 
( MN.GT.K ) ) THEN IF( MOD( MN - K, MB - K ).EQ.0 ) THEN @@ -249,12 +257,12 @@ INFO = -9 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN INFO = -11 - ELSE IF( ( LWORK.LT.MAX( 1, LW ) ) .AND. ( .NOT.LQUERY ) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -13 END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = SROUNDUP_LWORK(LW) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN @@ -266,7 +274,7 @@ * * Quick return if possible * - IF( MIN( M, N, K ).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * @@ -279,7 +287,7 @@ $ NB, C, LDC, WORK, LWORK, INFO ) END IF * - WORK( 1 ) = SROUNDUP_LWORK(LW) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) * RETURN * diff --git a/lapack-netlib/SRC/sgeqlf.f b/lapack-netlib/SRC/sgeqlf.f index b1266c89e..14942b765 100644 --- a/lapack-netlib/SRC/sgeqlf.f +++ b/lapack-netlib/SRC/sgeqlf.f @@ -88,7 +88,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,N). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise. *> For optimum performance LWORK >= N*NB, where NB is the *> optimal blocksize. *> @@ -189,8 +190,9 @@ END IF WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * - IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + IF( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( M.GT.0 .AND. LWORK.LT.MAX( 1, N ) ) ) + $ INFO = -7 END IF END IF * diff --git a/lapack-netlib/SRC/sgeqp3rk.f b/lapack-netlib/SRC/sgeqp3rk.f index bb5da72dc..d3a335b88 100644 --- a/lapack-netlib/SRC/sgeqp3rk.f +++ b/lapack-netlib/SRC/sgeqp3rk.f @@ -427,7 +427,8 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*. LWORK >= (3*N + NRHS - 1) +*> LWORK >= 1, if MIN(M,N) = 0, and +*> LWORK >= (3*N+NRHS-1), otherwise. *> For optimal performance LWORK >= (2*N + NB*( N+NRHS+1 )), *> where NB is the optimal block size for SGEQP3RK returned *> by ILAENV. Minimal block size MINNB=2. @@ -618,8 +619,9 @@ * .. External Functions .. LOGICAL SISNAN INTEGER ISAMAX, ILAENV - REAL SLAMCH, SNRM2 - EXTERNAL SISNAN, SLAMCH, SNRM2, ISAMAX, ILAENV + REAL SLAMCH, SNRM2, SROUNDUP_LWORK + EXTERNAL SISNAN, SLAMCH, SNRM2, ISAMAX, ILAENV, + $ SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC REAL, MAX, MIN @@ -696,7 +698,7 @@ * LWKOPT = 2*N + NB*( N+NRHS+1 ) END IF - WORK( 1 ) = REAL( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * IF( ( LWORK.LT.IWS ) .AND. .NOT.LQUERY ) THEN INFO = -15 @@ -719,7 +721,7 @@ K = 0 MAXC2NRMK = ZERO RELMAXC2NRMK = ZERO - WORK( 1 ) = REAL( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN END IF * @@ -772,7 +774,7 @@ * * Array TAU is not set and contains undefined elements. * - WORK( 1 ) = REAL( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN END IF * @@ -791,7 +793,7 @@ TAU( J ) = ZERO END DO * - WORK( 1 ) = REAL( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * END IF @@ -822,7 +824,7 @@ DO J = 1, MINMN TAU( J ) = ZERO END DO - WORK( 1 ) = REAL( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN END IF * @@ -867,7 +869,7 @@ TAU( J ) = ZERO END DO * - WORK( 1 ) = REAL( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN END IF * @@ -985,7 +987,7 @@ * * Return from the routine. 
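[Reviewer note, illustration only, not part of the patch] The recurring WORK( 1 ) = REAL( LWKOPT ) -> SROUNDUP_LWORK( LWKOPT ) substitutions in SGEQP3RK above (and in the other single-precision files in this patch) matter once LWKOPT exceeds 2**24: plain REAL() may round the count down, so INT( WORK( 1 ) ) would under-report the required workspace. A tiny sketch of the failure mode, assuming LAPACK's SROUNDUP_LWORK helper is available at link time:

      program demo_sroundup_lwork
         implicit none
         integer :: lwkopt
         real :: sroundup_lwork
         external sroundup_lwork

         lwkopt = 2**24 + 1                        ! 16777217 is not representable in REAL
         print *, 'int(real(lwkopt))           =', int( real( lwkopt ) )           ! rounds down to 16777216
         print *, 'int(sroundup_lwork(lwkopt)) =', int( sroundup_lwork( lwkopt ) ) ! expected to be >= lwkopt
      end program demo_sroundup_lwork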
* - WORK( 1 ) = REAL( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * @@ -1072,7 +1074,7 @@ * END IF * - WORK( 1 ) = REAL( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/sgeqr.f b/lapack-netlib/SRC/sgeqr.f index 6f41a92ea..79a515e1c 100644 --- a/lapack-netlib/SRC/sgeqr.f +++ b/lapack-netlib/SRC/sgeqr.f @@ -99,7 +99,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. The routine *> only calculates the sizes of the T and WORK arrays, returns these *> values as the first entries of the T and WORK arrays, and no error @@ -168,6 +168,8 @@ *> *> \endverbatim *> +*> \ingroup geqr +*> * ===================================================================== SUBROUTINE SGEQR( M, N, A, LDA, T, TSIZE, WORK, LWORK, $ INFO ) @@ -188,11 +190,13 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, LMINWS, MINT, MINW - INTEGER MB, NB, MINTSZ, NBLCKS + INTEGER MB, NB, MINTSZ, NBLCKS, LWMIN, LWREQ * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SLATSQR, SGEQRT, XERBLA @@ -244,8 +248,10 @@ * * Determine if the workspace size satisfies minimal size * + LWMIN = MAX( 1, N ) + LWREQ = MAX( 1, N*NB ) LMINWS = .FALSE. - IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.NB*N ) + IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.LWREQ ) $ .AND. ( LWORK.GE.N ) .AND. ( TSIZE.GE.MINTSZ ) $ .AND. ( .NOT.LQUERY ) ) THEN IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) ) THEN @@ -253,7 +259,7 @@ NB = 1 MB = M END IF - IF( LWORK.LT.NB*N ) THEN + IF( LWORK.LT.LWREQ ) THEN LMINWS = .TRUE. NB = 1 END IF @@ -268,7 +274,7 @@ ELSE IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) $ .AND. ( .NOT.LQUERY ) .AND. ( .NOT.LMINWS ) ) THEN INFO = -6 - ELSE IF( ( LWORK.LT.MAX( 1, N*NB ) ) .AND. ( .NOT.LQUERY ) + ELSE IF( ( LWORK.LT.LWREQ ) .AND. ( .NOT.LQUERY ) $ .AND. ( .NOT.LMINWS ) ) THEN INFO = -8 END IF @@ -282,9 +288,9 @@ T( 2 ) = MB T( 3 ) = NB IF( MINW ) THEN - WORK( 1 ) = MAX( 1, N ) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) ELSE - WORK( 1 ) = MAX( 1, NB*N ) + WORK( 1 ) = SROUNDUP_LWORK( LWREQ ) END IF END IF IF( INFO.NE.0 ) THEN @@ -309,7 +315,7 @@ $ LWORK, INFO ) END IF * - WORK( 1 ) = MAX( 1, NB*N ) + WORK( 1 ) = SROUNDUP_LWORK( LWREQ ) * RETURN * diff --git a/lapack-netlib/SRC/sgeqrfp.f b/lapack-netlib/SRC/sgeqrfp.f index d1ee2a828..37747c512 100644 --- a/lapack-netlib/SRC/sgeqrfp.f +++ b/lapack-netlib/SRC/sgeqrfp.f @@ -97,7 +97,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,N). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise. *> For optimum performance LWORK >= N*NB, where NB is *> the optimal blocksize. *> @@ -162,8 +163,8 @@ * * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, IINFO, IWS, K, LDWORK, LWKOPT, NB, - $ NBMIN, NX + INTEGER I, IB, IINFO, IWS, K, LDWORK, LWKMIN, LWKOPT, + $ NB, NBMIN, NX * .. * .. External Subroutines .. EXTERNAL SGEQR2P, SLARFB, SLARFT, XERBLA @@ -173,8 +174,9 @@ * .. * .. External Functions .. INTEGER ILAENV + EXTERNAL ILAENV REAL SROUNDUP_LWORK - EXTERNAL ILAENV, SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK * .. * .. Executable Statements .. 
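A recurring change in this patch, visible in the SGEQP3RK hunks above and in most routines below, is replacing WORK( 1 ) = REAL( LWKOPT ) with WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ). The motivation: a large integer workspace size need not be representable in single precision, and REAL( LWKOPT ) can round it down, so a caller doing INT( WORK( 1 ) ) after a query would under-allocate; SROUNDUP_LWORK rounds up instead. A stand-alone sketch of the failure mode follows; the constant 16777217 = 2**24 + 1 is simply the smallest positive integer that binary32 cannot represent and is not taken from the patch.

      PROGRAM RNDDWN
      IMPLICIT NONE
      INTEGER            LWKOPT
      REAL               W
      REAL               SROUNDUP_LWORK
      EXTERNAL           SROUNDUP_LWORK
      INTRINSIC          INT, REAL
*     2**24 + 1 is not representable in IEEE single precision and
*     rounds DOWN when converted with REAL(...).
      LWKOPT = 16777217
      W = REAL( LWKOPT )
      WRITE( *, * ) 'REAL then INT gives           ', INT( W )
*     SROUNDUP_LWORK rounds up, so INT of the result is >= LWKOPT.
      W = SROUNDUP_LWORK( LWKOPT )
      WRITE( *, * ) 'SROUNDUP_LWORK then INT gives ', INT( W )
      END

On an IEEE single-precision build the first line prints 16777216, one less than requested, while the second prints at least 16777217.
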
* @@ -182,8 +184,16 @@ * INFO = 0 NB = ILAENV( 1, 'SGEQRF', ' ', M, N, -1, -1 ) - LWKOPT = N*NB - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + K = MIN( M, N ) + IF( K.EQ.0 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = N + LWKOPT = N*NB + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) +* LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -191,7 +201,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -7 END IF IF( INFO.NE.0 ) THEN @@ -211,7 +221,7 @@ * NBMIN = 2 NX = 0 - IWS = N + IWS = LWKMIN IF( NB.GT.1 .AND. NB.LT.K ) THEN * * Determine when to cross over from blocked to unblocked code. @@ -273,7 +283,7 @@ $ CALL SGEQR2P( M-I+1, N-I+1, A( I, I ), LDA, TAU( I ), WORK, $ IINFO ) * - WORK( 1 ) = SROUNDUP_LWORK(IWS) + WORK( 1 ) = SROUNDUP_LWORK( IWS ) RETURN * * End of SGEQRFP diff --git a/lapack-netlib/SRC/sgesvj.f b/lapack-netlib/SRC/sgesvj.f index 3f53a5a15..36aed2853 100644 --- a/lapack-netlib/SRC/sgesvj.f +++ b/lapack-netlib/SRC/sgesvj.f @@ -208,7 +208,7 @@ *> *> \param[in,out] WORK *> \verbatim -*> WORK is REAL array, dimension (LWORK) +*> WORK is REAL array, dimension (MAX(1,LWORK)) *> On entry, *> If JOBU = 'C' : *> WORK(1) = CTOL, where CTOL defines the threshold for convergence. @@ -239,7 +239,12 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> length of WORK, WORK >= MAX(6,M+N) +*> Length of WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MAX(6,M+N), otherwise. +*> +*> If on entry LWORK = -1, then a workspace query is assumed and +*> no computation is done; WORK(1) is set to the minial (and optimal) +*> length of WORK. *> \endverbatim *> *> \param[out] INFO @@ -260,7 +265,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEcomputational +*> \ingroup gesvj * *> \par Further Details: * ===================== @@ -351,9 +356,9 @@ INTEGER BLSKIP, EMPTSW, i, ibr, IERR, igl, IJBLSK, ir1, $ ISWROT, jbc, jgl, KBL, LKAHEAD, MVL, N2, N34, $ N4, NBL, NOTROT, p, PSKIPPED, q, ROWSKIP, - $ SWBAND - LOGICAL APPLV, GOSCALE, LOWER, LSVEC, NOSCALE, ROTOK, - $ RSVEC, UCTOL, UPPER + $ SWBAND, MINMN, LWMIN + LOGICAL APPLV, GOSCALE, LOWER, LQUERY, LSVEC, NOSCALE, + $ ROTOK, RSVEC, UCTOL, UPPER * .. * .. Local Arrays .. REAL FASTR( 5 ) @@ -369,8 +374,8 @@ INTEGER ISAMAX EXTERNAL ISAMAX * from LAPACK - REAL SLAMCH - EXTERNAL SLAMCH + REAL SLAMCH, SROUNDUP_LWORK + EXTERNAL SLAMCH, SROUNDUP_LWORK LOGICAL LSAME EXTERNAL LSAME * .. @@ -394,6 +399,14 @@ UPPER = LSAME( JOBA, 'U' ) LOWER = LSAME( JOBA, 'L' ) * + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 6, M+N ) + END IF +* + LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.( UPPER .OR. LOWER .OR. LSAME( JOBA, 'G' ) ) ) THEN INFO = -1 ELSE IF( .NOT.( LSVEC .OR. UCTOL .OR. LSAME( JOBU, 'N' ) ) ) THEN @@ -413,7 +426,7 @@ INFO = -11 ELSE IF( UCTOL .AND. ( WORK( 1 ).LE.ONE ) ) THEN INFO = -12 - ELSE IF( LWORK.LT.MAX( M+N, 6 ) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. ( .NOT.LQUERY ) ) THEN INFO = -13 ELSE INFO = 0 @@ -423,11 +436,14 @@ IF( INFO.NE.0 ) THEN CALL XERBLA( 'SGESVJ', -INFO ) RETURN + ELSE IF( LQUERY ) THEN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) + RETURN END IF * * #:) Quick return for void matrix * - IF( ( M.EQ.0 ) .OR. 
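The SGESVJ hunks here (the quick-return part continues just below) are more than cosmetic: the routine previously had no LWORK = -1 path at all, so callers had to hard-code MAX(6,M+N). A hypothetical query sketch; the 5x3 problem and the LWMAX bound are chosen purely for illustration.

      PROGRAM SVJQRY
      IMPLICIT NONE
      INTEGER            M, N, LDA, LDV, LWMAX
      PARAMETER          ( M = 5, N = 3, LDA = M, LDV = 1,
     $                     LWMAX = 64 )
      INTEGER            I, J, INFO, LWORK
      REAL               A( LDA, N ), SVA( N ), V( LDV, 1 ),
     $                   WORK( LWMAX )
      EXTERNAL           SGESVJ
      INTRINSIC          INT, REAL
      DO 20 J = 1, N
         DO 10 I = 1, M
            A( I, J ) = REAL( I ) / REAL( J + 1 )
   10    CONTINUE
   20 CONTINUE
*     New in this patch: LWORK = -1 asks SGESVJ for the minimal
*     workspace (MAX(6,M+N) here) instead of computing anything.
      CALL SGESVJ( 'G', 'N', 'N', M, N, A, LDA, SVA, 0, V, LDV,
     $             WORK, -1, INFO )
      LWORK = INT( WORK( 1 ) )
*     Now compute the singular values with the reported workspace.
      CALL SGESVJ( 'G', 'N', 'N', M, N, A, LDA, SVA, 0, V, LDV,
     $             WORK, LWORK, INFO )
      WRITE( *, * ) 'INFO = ', INFO, ' singular values ', SVA
      END
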
( N.EQ.0 ) )RETURN + IF( MINMN.EQ.0 ) RETURN * * Set numerical parameters * The stopping criterion for Jacobi rotations is diff --git a/lapack-netlib/SRC/sgetri.f b/lapack-netlib/SRC/sgetri.f index fe71bc4a5..7b06bb63d 100644 --- a/lapack-netlib/SRC/sgetri.f +++ b/lapack-netlib/SRC/sgetri.f @@ -137,8 +137,9 @@ * .. * .. External Functions .. INTEGER ILAENV + EXTERNAL ILAENV REAL SROUNDUP_LWORK - EXTERNAL ILAENV, SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SGEMM, SGEMV, SSWAP, STRSM, STRTRI, XERBLA @@ -152,8 +153,9 @@ * INFO = 0 NB = ILAENV( 1, 'SGETRI', ' ', N, -1, -1, -1 ) - LWKOPT = N*NB - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + LWKOPT = MAX( 1, N*NB ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) +* LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.0 ) THEN INFO = -1 @@ -251,7 +253,7 @@ $ CALL SSWAP( N, A( 1, J ), 1, A( 1, JP ), 1 ) 60 CONTINUE * - WORK( 1 ) = SROUNDUP_LWORK(IWS) + WORK( 1 ) = SROUNDUP_LWORK( IWS ) RETURN * * End of SGETRI diff --git a/lapack-netlib/SRC/sgetsls.f b/lapack-netlib/SRC/sgetsls.f index d89c6a4e6..08a427a8b 100644 --- a/lapack-netlib/SRC/sgetsls.f +++ b/lapack-netlib/SRC/sgetsls.f @@ -127,7 +127,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. *> If LWORK = -1, the routine calculates optimal size of WORK for the *> optimal performance and returns this value in WORK(1). @@ -226,7 +226,10 @@ * * Determine the optimum and minimum LWORK * - IF( M.GE.N ) THEN + IF( MIN( M, N, NRHS ).EQ.0 ) THEN + WSIZEO = 1 + WSIZEM = 1 + ELSE IF( M.GE.N ) THEN CALL SGEQR( M, N, A, LDA, TQ, -1, WORKQ, -1, INFO2 ) TSZO = INT( TQ( 1 ) ) LWO = INT( WORKQ( 1 ) ) diff --git a/lapack-netlib/SRC/sgetsqrhrt.f b/lapack-netlib/SRC/sgetsqrhrt.f index d80ff4da8..7ade8a66c 100644 --- a/lapack-netlib/SRC/sgetsqrhrt.f +++ b/lapack-netlib/SRC/sgetsqrhrt.f @@ -130,14 +130,17 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The dimension of the array WORK. -*> LWORK >= MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ), +*> If MIN(M,N) = 0, LWORK >= 1, else +*> LWORK >= MAX( 1, LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ), *> where *> NUM_ALL_ROW_BLOCKS = CEIL((M-N)/(MB1-N)), *> NB1LOCAL = MIN(NB1,N). *> LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL, *> LW1 = NB1LOCAL * N, -*> LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ), +*> LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ). +*> *> If LWORK = -1, then a workspace query is assumed. *> The routine only calculates the optimal size of the WORK *> array, returns this value as the first entry of the WORK @@ -216,7 +219,7 @@ * Test the input arguments * INFO = 0 - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 .OR. M.LT.N ) THEN @@ -229,7 +232,7 @@ INFO = -5 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -7 - ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN + ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN INFO = -9 ELSE * @@ -267,8 +270,9 @@ LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ) * LWORKOPT = MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ) + LWORKOPT = MAX( 1, LWORKOPT ) * - IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN + IF( LWORK.LT.LWORKOPT .AND. 
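SGETSLS above now sizes its workspace by calling SGEQR with TSIZE = -1 and LWORK = -1, the query convention documented for SGEQR/SGEMQR earlier in this patch. A hedged sketch of that double query used directly, outside SGETSLS; the 6x2 shape, the test data and the static TMAX/LWMAX bounds are invented for illustration.

      PROGRAM TSQR
      IMPLICIT NONE
      INTEGER            M, N, LDA, TMAX, LWMAX
      PARAMETER          ( M = 6, N = 2, LDA = M, TMAX = 256,
     $                     LWMAX = 256 )
      INTEGER            I, J, INFO, LWORK, TSIZE
      REAL               A( LDA, N ), B( M ), T( TMAX ), WORK( LWMAX )
      EXTERNAL           SGEQR, SGEMQR
      INTRINSIC          INT, REAL
      DO 20 J = 1, N
         DO 10 I = 1, M
            A( I, J ) = REAL( I )**( J - 1 )
   10    CONTINUE
   20 CONTINUE
      DO 30 I = 1, M
         B( I ) = REAL( I )
   30 CONTINUE
*     Query: T(1) and WORK(1) return the required TSIZE and LWORK.
      CALL SGEQR( M, N, A, LDA, T, -1, WORK, -1, INFO )
      TSIZE = INT( T( 1 ) )
      LWORK = INT( WORK( 1 ) )
*     Factor; TMAX and LWMAX are generous for this toy size, a real
*     caller would allocate TSIZE and LWORK elements instead.
      CALL SGEQR( M, N, A, LDA, T, TSIZE, WORK, LWORK, INFO )
*     Apply Q**T to the right-hand side (SGEMQR reads the block
*     sizes it needs from T, so the factorization must come first).
      CALL SGEMQR( 'L', 'T', M, 1, N, A, LDA, T, TSIZE, B, M,
     $             WORK, LWMAX, INFO )
      WRITE( *, * ) 'INFO = ', INFO
      END
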
.NOT.LQUERY ) THEN INFO = -11 END IF * @@ -350,4 +354,4 @@ * * End of SGETSQRHRT * - END \ No newline at end of file + END diff --git a/lapack-netlib/SRC/sgges3.f b/lapack-netlib/SRC/sgges3.f index e35d4955a..e90cd6947 100644 --- a/lapack-netlib/SRC/sgges3.f +++ b/lapack-netlib/SRC/sgges3.f @@ -234,6 +234,8 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. +*> If N = 0, LWORK >= 1, else LWORK >= 6*N+16. +*> For good performance, LWORK must generally be larger. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -309,7 +311,8 @@ LOGICAL CURSL, ILASCL, ILBSCL, ILVSL, ILVSR, LASTSL, $ LQUERY, LST2SL, WANTST INTEGER I, ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT, - $ ILO, IP, IRIGHT, IROWS, ITAU, IWRK, LWKOPT + $ ILO, IP, IRIGHT, IROWS, ITAU, IWRK, LWKOPT, + $ LWKMIN REAL ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS, PVSL, $ PVSR, SAFMAX, SAFMIN, SMLNUM * .. @@ -361,6 +364,12 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + IF( N.EQ.0 ) THEN + LWKMIN = 1 + ELSE + LWKMIN = 6*N+16 + END IF +* IF( IJOBVL.LE.0 ) THEN INFO = -1 ELSE IF( IJOBVR.LE.0 ) THEN @@ -377,7 +386,7 @@ INFO = -15 ELSE IF( LDVSR.LT.1 .OR. ( ILVSR .AND. LDVSR.LT.N ) ) THEN INFO = -17 - ELSE IF( LWORK.LT.6*N+16 .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -19 END IF * @@ -385,7 +394,7 @@ * IF( INFO.EQ.0 ) THEN CALL SGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR ) - LWKOPT = MAX( 6*N+16, 3*N+INT( WORK( 1 ) ) ) + LWKOPT = MAX( LWKMIN, 3*N+INT( WORK( 1 ) ) ) CALL SORMQR( 'L', 'T', N, N, N, B, LDB, WORK, A, LDA, WORK, $ -1, IERR ) LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) ) @@ -407,7 +416,11 @@ $ IERR ) LWKOPT = MAX( LWKOPT, 2*N+INT( WORK( 1 ) ) ) END IF - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + IF( N.EQ.0 ) THEN + WORK( 1 ) = 1 + ELSE + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) + END IF END IF * IF( INFO.NE.0 ) THEN @@ -421,6 +434,7 @@ * IF( N.EQ.0 ) THEN SDIM = 0 + WORK( 1 ) = 1 RETURN END IF * @@ -657,7 +671,7 @@ * 40 CONTINUE * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/sggev3.f b/lapack-netlib/SRC/sggev3.f index c82d2187f..d788d1147 100644 --- a/lapack-netlib/SRC/sggev3.f +++ b/lapack-netlib/SRC/sggev3.f @@ -189,6 +189,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER +*> The dimension of the array WORK. LWORK >= MAX(1,8*N). +*> For good performance, LWORK should generally be larger. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -248,7 +250,8 @@ LOGICAL ILASCL, ILBSCL, ILV, ILVL, ILVR, LQUERY CHARACTER CHTEMP INTEGER ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT, ILO, - $ IN, IRIGHT, IROWS, ITAU, IWRK, JC, JR, LWKOPT + $ IN, IRIGHT, IROWS, ITAU, IWRK, JC, JR, LWKOPT, + $ LWKMIN REAL ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS, $ SMLNUM, TEMP * .. @@ -298,6 +301,7 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 8*N ) IF( IJOBVL.LE.0 ) THEN INFO = -1 ELSE IF( IJOBVR.LE.0 ) THEN @@ -312,7 +316,7 @@ INFO = -12 ELSE IF( LDVR.LT.1 .OR. ( ILVR .AND. LDVR.LT.N ) ) THEN INFO = -14 - ELSE IF( LWORK.LT.MAX( 1, 8*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. 
.NOT.LQUERY ) THEN INFO = -16 END IF * @@ -320,28 +324,31 @@ * IF( INFO.EQ.0 ) THEN CALL SGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR ) - LWKOPT = MAX( 1, 8*N, 3*N+INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKMIN, 3*N+INT( WORK( 1 ) ) ) CALL SORMQR( 'L', 'T', N, N, N, B, LDB, WORK, A, LDA, WORK, $ -1, IERR ) - LWKOPT = MAX( LWKOPT, 3*N+INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) ) CALL SGGHD3( JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB, VL, LDVL, $ VR, LDVR, WORK, -1, IERR ) - LWKOPT = MAX( LWKOPT, 3*N+INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) ) IF( ILVL ) THEN CALL SORGQR( N, N, N, VL, LDVL, WORK, WORK, -1, IERR ) - LWKOPT = MAX( LWKOPT, 3*N+INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) ) CALL SLAQZ0( 'S', JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB, $ ALPHAR, ALPHAI, BETA, VL, LDVL, VR, LDVR, $ WORK, -1, 0, IERR ) - LWKOPT = MAX( LWKOPT, 2*N+INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 2*N+INT( WORK( 1 ) ) ) ELSE CALL SLAQZ0( 'E', JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB, $ ALPHAR, ALPHAI, BETA, VL, LDVL, VR, LDVR, $ WORK, -1, 0, IERR ) - LWKOPT = MAX( LWKOPT, 2*N+INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 2*N+INT( WORK( 1 ) ) ) + END IF + IF( N.EQ.0 ) THEN + WORK( 1 ) = 1 + ELSE + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF - WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) -* END IF * IF( INFO.NE.0 ) THEN diff --git a/lapack-netlib/SRC/sgghd3.f b/lapack-netlib/SRC/sgghd3.f index 9c5858b5a..01e57088a 100644 --- a/lapack-netlib/SRC/sgghd3.f +++ b/lapack-netlib/SRC/sgghd3.f @@ -179,14 +179,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL array, dimension (LWORK) +*> WORK is REAL array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> -*> \param[in] LWORK +*> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of the array WORK. LWORK >= 1. +*> The length of the array WORK. LWORK >= 1. *> For optimum performance LWORK >= 6*N*NB, where NB is the *> optimal blocksize. *> @@ -276,7 +276,12 @@ * INFO = 0 NB = ILAENV( 1, 'SGGHD3', ' ', N, ILO, IHI, -1 ) - LWKOPT = MAX( 6*N*NB, 1 ) + NH = IHI - ILO + 1 + IF( NH.LE.1 ) THEN + LWKOPT = 1 + ELSE + LWKOPT = 6*N*NB + END IF WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) INITQ = LSAME( COMPQ, 'I' ) WANTQ = INITQ .OR. LSAME( COMPQ, 'V' ) @@ -326,7 +331,6 @@ * * Quick return if possible * - NH = IHI - ILO + 1 IF( NH.LE.1 ) THEN WORK( 1 ) = ONE RETURN @@ -886,6 +890,7 @@ IF ( JCOL.LT.IHI ) $ CALL SGGHRD( COMPQ2, COMPZ2, N, JCOL, IHI, A, LDA, B, LDB, Q, $ LDQ, Z, LDZ, IERR ) +* WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN diff --git a/lapack-netlib/SRC/sggqrf.f b/lapack-netlib/SRC/sggqrf.f index ebb42a899..d32b48410 100644 --- a/lapack-netlib/SRC/sggqrf.f +++ b/lapack-netlib/SRC/sggqrf.f @@ -236,8 +236,9 @@ * .. * .. External Functions .. INTEGER ILAENV + EXTERNAL ILAENV REAL SROUNDUP_LWORK - EXTERNAL ILAENV, SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK * .. * .. Intrinsic Functions .. 
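The SGGEV3 changes completed here (and the similar SGGES3 changes just above) validate LWORK against a precomputed LWKMIN = MAX(1,8*N) and report WORK(1) = 1 for N = 0. A hypothetical query-then-solve sketch for a 2x2 generalized eigenproblem; the matrix values and the LWMAX cap are invented for illustration.

      PROGRAM GGEVQ
      IMPLICIT NONE
      INTEGER            N, LDA, LDB, LDVL, LDVR, LWMAX
      PARAMETER          ( N = 2, LDA = N, LDB = N, LDVL = N,
     $                     LDVR = N, LWMAX = 256 )
      INTEGER            INFO, LWORK
      REAL               A( LDA, N ), B( LDB, N ), ALPHAR( N ),
     $                   ALPHAI( N ), BETA( N ), VL( LDVL, N ),
     $                   VR( LDVR, N ), WORK( LWMAX )
      EXTERNAL           SGGEV3
      INTRINSIC          INT, MIN
      DATA               A / 2.0E0, 1.0E0, 1.0E0, 3.0E0 /
      DATA               B / 1.0E0, 0.0E0, 0.0E0, 1.0E0 /
*     Workspace query: LWORK = -1 returns the optimal size, which
*     is at least the documented minimum MAX(1,8*N).
      CALL SGGEV3( 'N', 'V', N, A, LDA, B, LDB, ALPHAR, ALPHAI,
     $             BETA, VL, LDVL, VR, LDVR, WORK, -1, INFO )
*     Cap by the static bound; anything >= 8*N stays legal.
      LWORK = MIN( LWMAX, INT( WORK( 1 ) ) )
      CALL SGGEV3( 'N', 'V', N, A, LDA, B, LDB, ALPHAR, ALPHAI,
     $             BETA, VL, LDVL, VR, LDVR, WORK, LWORK, INFO )
      WRITE( *, * ) 'INFO = ', INFO, ' eigenvalues ',
     $   ALPHAR( 1 ) / BETA( 1 ), ALPHAR( 2 ) / BETA( 2 )
      END
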
INTRINSIC INT, MAX, MIN @@ -251,8 +252,9 @@ NB2 = ILAENV( 1, 'SGERQF', ' ', N, P, -1, -1 ) NB3 = ILAENV( 1, 'SORMQR', ' ', N, M, P, -1 ) NB = MAX( NB1, NB2, NB3 ) - LWKOPT = MAX( N, M, P )*NB - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + LWKOPT = MAX( 1, MAX( N, M, P )*NB ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) +* LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.0 ) THEN INFO = -1 @@ -289,6 +291,7 @@ * CALL SGERQF( N, P, B, LDB, TAUB, WORK, LWORK, INFO ) LWKOPT = MAX( LOPT, INT( WORK( 1 ) ) ) +* WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN diff --git a/lapack-netlib/SRC/sggrqf.f b/lapack-netlib/SRC/sggrqf.f index 2163f1ef8..b3842ec2a 100644 --- a/lapack-netlib/SRC/sggrqf.f +++ b/lapack-netlib/SRC/sggrqf.f @@ -250,7 +250,7 @@ NB2 = ILAENV( 1, 'SGEQRF', ' ', P, N, -1, -1 ) NB3 = ILAENV( 1, 'SORMRQ', ' ', M, N, P, -1 ) NB = MAX( NB1, NB2, NB3 ) - LWKOPT = MAX( N, M, P)*NB + LWKOPT = MAX( 1, MAX( N, M, P )*NB ) WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN diff --git a/lapack-netlib/SRC/sggsvd3.f b/lapack-netlib/SRC/sggsvd3.f index 053fff5de..cee630593 100644 --- a/lapack-netlib/SRC/sggsvd3.f +++ b/lapack-netlib/SRC/sggsvd3.f @@ -278,7 +278,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns diff --git a/lapack-netlib/SRC/sggsvp3.f b/lapack-netlib/SRC/sggsvp3.f index a463b9064..8e90d770c 100644 --- a/lapack-netlib/SRC/sggsvp3.f +++ b/lapack-netlib/SRC/sggsvp3.f @@ -227,7 +227,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -300,8 +300,9 @@ * .. * .. External Functions .. LOGICAL LSAME + EXTERNAL LSAME REAL SROUNDUP_LWORK - EXTERNAL LSAME, SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SGEQP3, SGEQR2, SGERQ2, SLACPY, SLAPMT, diff --git a/lapack-netlib/SRC/slamswlq.f b/lapack-netlib/SRC/slamswlq.f index d4996b1f2..432afaded 100644 --- a/lapack-netlib/SRC/slamswlq.f +++ b/lapack-netlib/SRC/slamswlq.f @@ -127,17 +127,20 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) REAL array, dimension (MAX(1,LWORK)) +*> (workspace) REAL array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*> If SIDE = 'L', LWORK >= max(1,NB) * MB; -*> if SIDE = 'R', LWORK >= max(1,M) * MB. +*> +*> If MIN(M,N,K) = 0, LWORK >= 1. +*> If SIDE = 'L', LWORK >= max(1,NB*MB). +*> If SIDE = 'R', LWORK >= max(1,M*MB). *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> \endverbatim @@ -189,33 +192,38 @@ *> SIAM J. Sci. Comput, vol. 34, no. 
1, 2012 *> \endverbatim *> +*> \ingroup lamswlq +*> * ===================================================================== SUBROUTINE SLAMSWLQ( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T, - $ LDT, C, LDC, WORK, LWORK, INFO ) + $ LDT, C, LDC, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * * .. Scalar Arguments .. - CHARACTER SIDE, TRANS - INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC + CHARACTER SIDE, TRANS + INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC * .. * .. Array Arguments .. - REAL A( LDA, * ), WORK( * ), C(LDC, * ), - $ T( LDT, * ) + REAL A( LDA, * ), WORK( * ), C( LDC, * ), + $ T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER I, II, KK, LW, CTR + LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY + INTEGER I, II, KK, LW, CTR, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK +* .. * .. External Subroutines .. EXTERNAL STPMLQT, SGEMLQT, XERBLA * .. @@ -223,52 +231,60 @@ * * Test the input arguments * - LQUERY = LWORK.LT.0 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'T' ) LEFT = LSAME( SIDE, 'L' ) RIGHT = LSAME( SIDE, 'R' ) - IF (LEFT) THEN + IF( LEFT ) THEN LW = N * MB ELSE LW = M * MB END IF +* + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF * INFO = 0 IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN - INFO = -1 + INFO = -1 ELSE IF( .NOT.TRAN .AND. .NOT.NOTRAN ) THEN - INFO = -2 + INFO = -2 ELSE IF( K.LT.0 ) THEN INFO = -5 ELSE IF( M.LT.K ) THEN INFO = -3 ELSE IF( N.LT.0 ) THEN INFO = -4 - ELSE IF( K.LT.MB .OR. MB.LT.1) THEN + ELSE IF( K.LT.MB .OR. MB.LT.1 ) THEN INFO = -6 ELSE IF( LDA.LT.MAX( 1, K ) ) THEN INFO = -9 - ELSE IF( LDT.LT.MAX( 1, MB) ) THEN + ELSE IF( LDT.LT.MAX( 1, MB ) ) THEN INFO = -11 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN - INFO = -13 - ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN + INFO = -13 + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -15 END IF * + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) + END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'SLAMSWLQ', -INFO ) - WORK(1) = LW RETURN - ELSE IF (LQUERY) THEN - WORK(1) = LW + ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * - IF( MIN(M,N,K).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * @@ -402,7 +418,7 @@ * END IF * - WORK(1) = LW + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of SLAMSWLQ diff --git a/lapack-netlib/SRC/slamtsqr.f b/lapack-netlib/SRC/slamtsqr.f index 960b794de..f9b167aea 100644 --- a/lapack-netlib/SRC/slamtsqr.f +++ b/lapack-netlib/SRC/slamtsqr.f @@ -128,22 +128,24 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) REAL array, dimension (MAX(1,LWORK)) -*> +*> (workspace) REAL array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim +*> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. +*> If MIN(M,N,K) = 0, LWORK >= 1. +*> If SIDE = 'L', LWORK >= max(1,N*NB). +*> If SIDE = 'R', LWORK >= max(1,MB*NB). *> -*> If SIDE = 'L', LWORK >= max(1,N)*NB; -*> if SIDE = 'R', LWORK >= max(1,MB)*NB. 
*> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. -*> *> \endverbatim +*> *> \param[out] INFO *> \verbatim *> INFO is INTEGER @@ -191,33 +193,38 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup lamtsqr +*> * ===================================================================== SUBROUTINE SLAMTSQR( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T, - $ LDT, C, LDC, WORK, LWORK, INFO ) + $ LDT, C, LDC, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * * .. Scalar Arguments .. - CHARACTER SIDE, TRANS - INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC + CHARACTER SIDE, TRANS + INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC * .. * .. Array Arguments .. - REAL A( LDA, * ), WORK( * ), C(LDC, * ), - $ T( LDT, * ) + REAL A( LDA, * ), WORK( * ), C( LDC, * ), + $ T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER I, II, KK, LW, CTR, Q + LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY + INTEGER I, II, KK, LW, CTR, Q, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK +* .. * .. External Subroutines .. EXTERNAL SGEMQRT, STPMQRT, XERBLA * .. @@ -225,12 +232,13 @@ * * Test the input arguments * - LQUERY = LWORK.LT.0 + INFO = 0 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'T' ) LEFT = LSAME( SIDE, 'L' ) RIGHT = LSAME( SIDE, 'R' ) - IF (LEFT) THEN + IF( LEFT ) THEN LW = N * NB Q = M ELSE @@ -238,11 +246,17 @@ Q = N END IF * - INFO = 0 + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF +* IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN - INFO = -1 + INFO = -1 ELSE IF( .NOT.TRAN .AND. .NOT.NOTRAN ) THEN - INFO = -2 + INFO = -2 ELSE IF( M.LT.K ) THEN INFO = -3 ELSE IF( N.LT.0 ) THEN @@ -253,38 +267,38 @@ INFO = -7 ELSE IF( LDA.LT.MAX( 1, Q ) ) THEN INFO = -9 - ELSE IF( LDT.LT.MAX( 1, NB) ) THEN + ELSE IF( LDT.LT.MAX( 1, NB ) ) THEN INFO = -11 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN - INFO = -13 - ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN + INFO = -13 + ELSE IF( LWORK.LT.LWMIN. AND. 
(.NOT.LQUERY) ) THEN INFO = -15 END IF * -* Determine the block size if it is tall skinny or short and wide -* - IF( INFO.EQ.0) THEN - WORK(1) = LW + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN CALL XERBLA( 'SLAMTSQR', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N,K).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF +* +* Determine the block size if it is tall skinny or short and wide * IF((MB.LE.K).OR.(MB.GE.MAX(M,N,K))) THEN CALL SGEMQRT( SIDE, TRANS, M, N, K, NB, A, LDA, - $ T, LDT, C, LDC, WORK, INFO) + $ T, LDT, C, LDC, WORK, INFO ) RETURN - END IF + END IF * IF(LEFT.AND.NOTRAN) THEN * @@ -410,7 +424,7 @@ * END IF * - WORK(1) = LW + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of SLAMTSQR diff --git a/lapack-netlib/SRC/slaswlq.f b/lapack-netlib/SRC/slaswlq.f index 685f823a0..594c646db 100644 --- a/lapack-netlib/SRC/slaswlq.f +++ b/lapack-netlib/SRC/slaswlq.f @@ -96,22 +96,24 @@ *> The leading dimension of the array T. LDT >= MB. *> \endverbatim *> -*> *> \param[out] WORK *> \verbatim -*> (workspace) REAL array, dimension (MAX(1,LWORK)) -*> +*> (workspace) REAL array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim +*> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= MB * M. +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MB*M, otherwise. +*> *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. -*> *> \endverbatim + *> \param[out] INFO *> \verbatim *> INFO is INTEGER @@ -163,32 +165,35 @@ *> * ===================================================================== SUBROUTINE SLASWLQ( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK, - $ INFO) + $ INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- * * .. Scalar Arguments .. - INTEGER INFO, LDA, M, N, MB, NB, LWORK, LDT + INTEGER INFO, LDA, M, N, MB, NB, LWORK, LDT * .. * .. Array Arguments .. - REAL A( LDA, * ), WORK( * ), T( LDT, *) + REAL A( LDA, * ), WORK( * ), T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LQUERY - INTEGER I, II, KK, CTR + LOGICAL LQUERY + INTEGER I, II, KK, CTR, MINMN, LWMIN * .. * .. EXTERNAL FUNCTIONS .. LOGICAL LSAME + EXTERNAL LSAME REAL SROUNDUP_LWORK - EXTERNAL LSAME, SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK +* .. * .. EXTERNAL SUBROUTINES .. EXTERNAL SGELQT, SGEQRT, STPLQT, STPQRT, XERBLA +* .. * .. INTRINSIC FUNCTIONS .. INTRINSIC MAX, MIN, MOD * .. @@ -199,12 +204,19 @@ INFO = 0 * LQUERY = ( LWORK.EQ.-1 ) +* + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = M*MB + END IF * IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 .OR. N.LT.M ) THEN INFO = -2 - ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 )) THEN + ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 ) ) THEN INFO = -3 ELSE IF( NB.LE.0 ) THEN INFO = -4 @@ -212,60 +224,60 @@ INFO = -6 ELSE IF( LDT.LT.MB ) THEN INFO = -8 - ELSE IF( ( LWORK.LT.M*MB) .AND. (.NOT.LQUERY) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. 
(.NOT.LQUERY) ) THEN INFO = -10 END IF - IF( INFO.EQ.0) THEN - WORK(1) = MB*M + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN CALL XERBLA( 'SLASWLQ', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N).EQ.0 ) THEN - RETURN + IF( MINMN.EQ.0 ) THEN + RETURN END IF * * The LQ Decomposition * - IF((M.GE.N).OR.(NB.LE.M).OR.(NB.GE.N)) THEN - CALL SGELQT( M, N, MB, A, LDA, T, LDT, WORK, INFO) + IF( (M.GE.N) .OR. (NB.LE.M) .OR. (NB.GE.N) ) THEN + CALL SGELQT( M, N, MB, A, LDA, T, LDT, WORK, INFO ) RETURN - END IF + END IF * - KK = MOD((N-M),(NB-M)) - II=N-KK+1 + KK = MOD((N-M),(NB-M)) + II = N-KK+1 * -* Compute the LQ factorization of the first block A(1:M,1:NB) +* Compute the LQ factorization of the first block A(1:M,1:NB) * - CALL SGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO) - CTR = 1 + CALL SGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO ) + CTR = 1 * - DO I = NB+1, II-NB+M , (NB-M) + DO I = NB+1, II-NB+M, (NB-M) * -* Compute the QR factorization of the current block A(1:M,I:I+NB-M) +* Compute the QR factorization of the current block A(1:M,I:I+NB-M) * - CALL STPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ), - $ LDA, T(1, CTR * M + 1), - $ LDT, WORK, INFO ) - CTR = CTR + 1 - END DO + CALL STPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ), + $ LDA, T(1, CTR * M + 1), + $ LDT, WORK, INFO ) + CTR = CTR + 1 + END DO * * Compute the QR factorization of the last block A(1:M,II:N) * - IF (II.LE.N) THEN + IF( II.LE.N ) THEN CALL STPLQT( M, KK, 0, MB, A(1,1), LDA, A( 1, II ), - $ LDA, T(1, CTR * M + 1), LDT, - $ WORK, INFO ) - END IF + $ LDA, T(1, CTR * M + 1), LDT, + $ WORK, INFO ) + END IF * - WORK( 1 ) = SROUNDUP_LWORK(M * MB) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of SLASWLQ diff --git a/lapack-netlib/SRC/slatrs3.f b/lapack-netlib/SRC/slatrs3.f index 8f0c4bf16..17052289e 100644 --- a/lapack-netlib/SRC/slatrs3.f +++ b/lapack-netlib/SRC/slatrs3.f @@ -151,13 +151,16 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL array, dimension (LWORK). -*> On exit, if INFO = 0, WORK(1) returns the optimal size of -*> WORK. +*> WORK is REAL array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK +*> \verbatim *> LWORK is INTEGER +*> The dimension of the array WORK. +*> +*> If MIN(N,NRHS) = 0, LWORK >= 1, else *> LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where *> NBA = (N + NB - 1)/NB and NB is the optimal block size. *> @@ -165,6 +168,7 @@ *> only calculates the optimal dimensions of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. +*> \endverbatim *> *> \param[out] INFO *> \verbatim @@ -181,7 +185,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleOTHERauxiliary +*> \ingroup latrs3 *> \par Further Details: * ===================== * \verbatim @@ -253,7 +257,7 @@ LOGICAL LQUERY, NOTRAN, NOUNIT, UPPER INTEGER AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J, $ JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2, - $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS + $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS, LWMIN REAL ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC, $ SCAMIN, SMLNUM, TMAX * .. @@ -264,7 +268,8 @@ EXTERNAL ILAENV, LSAME, SLAMCH, SLANGE, SLARMM * .. * .. External Subroutines .. - EXTERNAL SLATRS, SSCAL, XERBLA + REAL SROUNDUP_LWORK + EXTERNAL SLATRS, SSCAL, SROUNDUP_LWORK, XERBLA * .. 
* .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN @@ -292,15 +297,24 @@ * row. WORK( I + KK * LDS ) is the scale factor of the vector * segment associated with the I-th block row and the KK-th vector * in the block column. +* LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) ) LDS = NBA +* * The second part stores upper bounds of the triangular A. There are * a total of NBA x NBA blocks, of which only the upper triangular * part or the lower triangular part is referenced. The upper bound of * the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). +* LANRM = NBA * NBA AWRK = LSCALE - WORK( 1 ) = LSCALE + LANRM +* + IF( MIN( N, NRHS ).EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = LSCALE + LANRM + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) * * Test the input parameters. * @@ -322,7 +336,7 @@ INFO = -8 ELSE IF( LDX.LT.MAX( 1, N ) ) THEN INFO = -10 - ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN + ELSE IF( .NOT.LQUERY .AND. LWORK.LT.LWMIN ) THEN INFO = -14 END IF IF( INFO.NE.0 ) THEN @@ -650,6 +664,8 @@ END DO END DO RETURN +* + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) * * End of SLATRS3 * diff --git a/lapack-netlib/SRC/slatsqr.f b/lapack-netlib/SRC/slatsqr.f index 86733bb15..4730815b5 100644 --- a/lapack-netlib/SRC/slatsqr.f +++ b/lapack-netlib/SRC/slatsqr.f @@ -101,15 +101,18 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) REAL array, dimension (MAX(1,LWORK)) +*> (workspace) REAL array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= NB*N. +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= NB*N, otherwise. +*> *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> \endverbatim @@ -161,33 +164,39 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup latsqr +*> * ===================================================================== SUBROUTINE SLATSQR( M, N, MB, NB, A, LDA, T, LDT, WORK, - $ LWORK, INFO) + $ LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- * * .. Scalar Arguments .. - INTEGER INFO, LDA, M, N, MB, NB, LDT, LWORK + INTEGER INFO, LDA, M, N, MB, NB, LDT, LWORK * .. * .. Array Arguments .. - REAL A( LDA, * ), WORK( * ), T(LDT, *) + REAL A( LDA, * ), WORK( * ), T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LQUERY - INTEGER I, II, KK, CTR + LOGICAL LQUERY + INTEGER I, II, KK, CTR, MINMN, LWMIN * .. * .. EXTERNAL FUNCTIONS .. LOGICAL LSAME EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK +* .. * .. EXTERNAL SUBROUTINES .. EXTERNAL SGEQRT, STPQRT, XERBLA +* .. * .. INTRINSIC FUNCTIONS .. INTRINSIC MAX, MIN, MOD * .. @@ -198,6 +207,13 @@ INFO = 0 * LQUERY = ( LWORK.EQ.-1 ) +* + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = N*NB + END IF * IF( M.LT.0 ) THEN INFO = -1 @@ -205,64 +221,65 @@ INFO = -2 ELSE IF( MB.LT.1 ) THEN INFO = -3 - ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. N.GT.0 )) THEN + ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. 
N.GT.0 ) ) THEN INFO = -4 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -6 ELSE IF( LDT.LT.NB ) THEN INFO = -8 - ELSE IF( LWORK.LT.(N*NB) .AND. (.NOT.LQUERY) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -10 END IF - IF( INFO.EQ.0) THEN - WORK(1) = NB*N +* + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'SLATSQR', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N).EQ.0 ) THEN - RETURN + IF( MINMN.EQ.0 ) THEN + RETURN END IF * * The QR Decomposition * - IF ((MB.LE.N).OR.(MB.GE.M)) THEN - CALL SGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO) - RETURN - END IF - KK = MOD((M-N),(MB-N)) - II=M-KK+1 + IF( (MB.LE.N) .OR. (MB.GE.M) ) THEN + CALL SGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO ) + RETURN + END IF + KK = MOD((M-N),(MB-N)) + II = M-KK+1 * -* Compute the QR factorization of the first block A(1:MB,1:N) +* Compute the QR factorization of the first block A(1:MB,1:N) * - CALL SGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO ) + CALL SGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO ) * - CTR = 1 - DO I = MB+1, II-MB+N , (MB-N) + CTR = 1 + DO I = MB+1, II-MB+N, (MB-N) * -* Compute the QR factorization of the current block A(I:I+MB-N,1:N) +* Compute the QR factorization of the current block A(I:I+MB-N,1:N) * - CALL STPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA, - $ T(1, CTR * N + 1), - $ LDT, WORK, INFO ) - CTR = CTR + 1 - END DO + CALL STPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA, + $ T(1, CTR * N + 1), + $ LDT, WORK, INFO ) + CTR = CTR + 1 + END DO * -* Compute the QR factorization of the last block A(II:M,1:N) +* Compute the QR factorization of the last block A(II:M,1:N) * - IF (II.LE.M) THEN - CALL STPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA, - $ T(1, CTR * N + 1), LDT, - $ WORK, INFO ) - END IF + IF( II.LE.M ) THEN + CALL STPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA, + $ T(1, CTR * N + 1), LDT, + $ WORK, INFO ) + END IF * - work( 1 ) = N*NB + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of SLATSQR diff --git a/lapack-netlib/SRC/ssyevd.f b/lapack-netlib/SRC/ssyevd.f index a5e4638d6..2ae44fc81 100644 --- a/lapack-netlib/SRC/ssyevd.f +++ b/lapack-netlib/SRC/ssyevd.f @@ -96,8 +96,7 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL array, -*> dimension (LWORK) +*> WORK is REAL array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> @@ -251,7 +250,7 @@ $ N*ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 ) ) LIOPT = LIWMIN END IF - WORK( 1 ) = SROUNDUP_LWORK(LOPT) + WORK( 1 ) = SROUNDUP_LWORK( LOPT ) IWORK( 1 ) = LIOPT * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN @@ -335,7 +334,7 @@ IF( ISCALE.EQ.1 ) $ CALL SSCAL( N, ONE / SIGMA, W, 1 ) * - WORK( 1 ) = SROUNDUP_LWORK(LOPT) + WORK( 1 ) = SROUNDUP_LWORK( LOPT ) IWORK( 1 ) = LIOPT * RETURN diff --git a/lapack-netlib/SRC/ssyevr.f b/lapack-netlib/SRC/ssyevr.f index 47e4d7cbf..870facd60 100644 --- a/lapack-netlib/SRC/ssyevr.f +++ b/lapack-netlib/SRC/ssyevr.f @@ -271,7 +271,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,26*N). +*> The dimension of the array WORK. +*> If N <= 1, LWORK >= 1, else LWORK >= 26*N. *> For optimal efficiency, LWORK >= (NB+6)*N, *> where NB is the max of the blocksize for SSYTRD and SORMTR *> returned by ILAENV. @@ -292,7 +293,8 @@ *> \param[in] LIWORK *> \verbatim *> LIWORK is INTEGER -*> The dimension of the array IWORK. 
LIWORK >= max(1,10*N). +*> The dimension of the array IWORK. +*> If N <= 1, LIWORK >= 1, else LIWORK >= 10*N. *> *> If LIWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal sizes of the WORK and @@ -392,8 +394,13 @@ * LQUERY = ( ( LWORK.EQ.-1 ) .OR. ( LIWORK.EQ.-1 ) ) * - LWMIN = MAX( 1, 26*N ) - LIWMIN = MAX( 1, 10*N ) + IF( N.LE.1 ) THEN + LWMIN = 1 + LIWMIN = 1 + ELSE + LWMIN = 26*N + LIWMIN = 10*N + END IF * INFO = 0 IF( .NOT.( WANTZ .OR. LSAME( JOBZ, 'N' ) ) ) THEN @@ -428,7 +435,7 @@ NB = ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 ) NB = MAX( NB, ILAENV( 1, 'SORMTR', UPLO, N, -1, -1, -1 ) ) LWKOPT = MAX( ( NB+1 )*N, LWMIN ) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) IWORK( 1 ) = LIWMIN * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN @@ -677,7 +684,7 @@ * * Set WORK(1) to optimal workspace size. * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) IWORK( 1 ) = LIWMIN * RETURN diff --git a/lapack-netlib/SRC/ssyevr_2stage.f b/lapack-netlib/SRC/ssyevr_2stage.f index a2d6a6231..471e25977 100644 --- a/lapack-netlib/SRC/ssyevr_2stage.f +++ b/lapack-netlib/SRC/ssyevr_2stage.f @@ -278,6 +278,7 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. +*> If N <= 1, LWORK must be at least 1. *> If JOBZ = 'N' and N > 1, LWORK must be queried. *> LWORK = MAX(1, 26*N, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N + 5*N @@ -300,13 +301,14 @@ *> \param[out] IWORK *> \verbatim *> IWORK is INTEGER array, dimension (MAX(1,LIWORK)) -*> On exit, if INFO = 0, IWORK(1) returns the optimal LWORK. +*> On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. *> \endverbatim *> *> \param[in] LIWORK *> \verbatim *> LIWORK is INTEGER -*> The dimension of the array IWORK. LIWORK >= max(1,10*N). +*> The dimension of the array IWORK. +*> If N <= 1, LIWORK >= 1, else LIWORK >= 10*N. *> *> If LIWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the IWORK array, @@ -445,8 +447,14 @@ IB = ILAENV2STAGE( 2, 'SSYTRD_2STAGE', JOBZ, N, KD, -1, -1 ) LHTRD = ILAENV2STAGE( 3, 'SSYTRD_2STAGE', JOBZ, N, KD, IB, -1 ) LWTRD = ILAENV2STAGE( 4, 'SSYTRD_2STAGE', JOBZ, N, KD, IB, -1 ) - LWMIN = MAX( 26*N, 5*N + LHTRD + LWTRD ) - LIWMIN = MAX( 1, 10*N ) +* + IF( N.LE.1 ) THEN + LWMIN = 1 + LIWMIN = 1 + ELSE + LWMIN = MAX( 26*N, 5*N + LHTRD + LWTRD ) + LIWMIN = 10*N + END IF * INFO = 0 IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN @@ -485,7 +493,7 @@ * NB = ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 ) * NB = MAX( NB, ILAENV( 1, 'SORMTR', UPLO, N, -1, -1, -1 ) ) * LWKOPT = MAX( ( NB+1 )*N, LWMIN ) - WORK( 1 ) = SROUNDUP_LWORK(LWMIN) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) IWORK( 1 ) = LIWMIN END IF * @@ -505,7 +513,7 @@ END IF * IF( N.EQ.1 ) THEN - WORK( 1 ) = 26 + WORK( 1 ) = 1 IF( ALLEIG .OR. INDEIG ) THEN M = 1 W( 1 ) = A( 1, 1 ) @@ -733,7 +741,7 @@ * * Set WORK(1) to optimal workspace size. 
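The SSYEVR hunks above make the documented minima explicit: LWORK and LIWORK drop to 1 when N <= 1 and are 26*N and 10*N otherwise. SSYEVR is also a case where two workspaces are queried at once, as in the following sketch; the 3x3 matrix and the static bounds are invented for illustration.

      PROGRAM EVRQRY
      IMPLICIT NONE
      INTEGER            N, LDA, LDZ, LWMAX, LIWMAX
      PARAMETER          ( N = 3, LDA = N, LDZ = N, LWMAX = 512,
     $                     LIWMAX = 128 )
      INTEGER            INFO, LWORK, LIWORK, M
      INTEGER            ISUPPZ( 2*N ), IWORK( LIWMAX )
      REAL               A( LDA, N ), W( N ), Z( LDZ, N ),
     $                   WORK( LWMAX ), ABSTOL
      EXTERNAL           SSYEVR
      INTRINSIC          INT, MIN
      DATA               A / 4.0E0, 1.0E0, 0.0E0,
     $                       1.0E0, 3.0E0, 1.0E0,
     $                       0.0E0, 1.0E0, 2.0E0 /
      ABSTOL = 0.0E0
*     Query both workspaces at once: LWORK = -1 and LIWORK = -1.
*     After this patch the reported minima are 1 when N <= 1,
*     and 26*N / 10*N otherwise.
      CALL SSYEVR( 'V', 'A', 'U', N, A, LDA, 0.0E0, 0.0E0, 0, 0,
     $             ABSTOL, M, W, Z, LDZ, ISUPPZ, WORK, -1,
     $             IWORK, -1, INFO )
      LWORK = MIN( LWMAX, INT( WORK( 1 ) ) )
      LIWORK = MIN( LIWMAX, IWORK( 1 ) )
      CALL SSYEVR( 'V', 'A', 'U', N, A, LDA, 0.0E0, 0.0E0, 0, 0,
     $             ABSTOL, M, W, Z, LDZ, ISUPPZ, WORK, LWORK,
     $             IWORK, LIWORK, INFO )
      WRITE( *, * ) 'INFO = ', INFO, ' eigenvalues ', W
      END
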
* - WORK( 1 ) = SROUNDUP_LWORK(LWMIN) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) IWORK( 1 ) = LIWMIN * RETURN diff --git a/lapack-netlib/SRC/ssyevx.f b/lapack-netlib/SRC/ssyevx.f index 2204aa39b..aaed6dad5 100644 --- a/lapack-netlib/SRC/ssyevx.f +++ b/lapack-netlib/SRC/ssyevx.f @@ -338,14 +338,14 @@ IF( INFO.EQ.0 ) THEN IF( N.LE.1 ) THEN LWKMIN = 1 - WORK( 1 ) = SROUNDUP_LWORK(LWKMIN) + LWKOPT = 1 ELSE LWKMIN = 8*N NB = ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 ) NB = MAX( NB, ILAENV( 1, 'SORMTR', UPLO, N, -1, -1, -1 ) ) LWKOPT = MAX( LWKMIN, ( NB + 3 )*N ) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) $ INFO = -17 @@ -542,7 +542,7 @@ * * Set WORK(1) to optimal workspace size. * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/ssysv_aa.f b/lapack-netlib/SRC/ssysv_aa.f index e43d4de7f..711a275e1 100644 --- a/lapack-netlib/SRC/ssysv_aa.f +++ b/lapack-netlib/SRC/ssysv_aa.f @@ -177,12 +177,13 @@ * * .. Local Scalars .. LOGICAL LQUERY - INTEGER LWKOPT, LWKOPT_SYTRF, LWKOPT_SYTRS + INTEGER LWKMIN, LWKOPT, LWKOPT_SYTRF, LWKOPT_SYTRS * .. * .. External Functions .. LOGICAL LSAME + EXTERNAL LSAME REAL SROUNDUP_LWORK - EXTERNAL LSAME, SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL XERBLA, SSYTRS_AA, SSYTRF_AA @@ -196,6 +197,7 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 2*N, 3*N-2 ) IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -206,18 +208,18 @@ INFO = -5 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -8 - ELSE IF( LWORK.LT.MAX(2*N, 3*N-2) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF * IF( INFO.EQ.0 ) THEN CALL SSYTRF_AA( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) - LWKOPT_SYTRF = INT( WORK(1) ) + LWKOPT_SYTRF = INT( WORK( 1 ) ) CALL SSYTRS_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, $ -1, INFO ) - LWKOPT_SYTRS = INT( WORK(1) ) - LWKOPT = MAX( LWKOPT_SYTRF, LWKOPT_SYTRS ) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + LWKOPT_SYTRS = INT( WORK( 1 ) ) + LWKOPT = MAX( LWKMIN, LWKOPT_SYTRF, LWKOPT_SYTRS ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -239,7 +241,7 @@ * END IF * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/ssysv_aa_2stage.f b/lapack-netlib/SRC/ssysv_aa_2stage.f index 3d88e068e..fb068b3bf 100644 --- a/lapack-netlib/SRC/ssysv_aa_2stage.f +++ b/lapack-netlib/SRC/ssysv_aa_2stage.f @@ -100,14 +100,14 @@ *> *> \param[out] TB *> \verbatim -*> TB is REAL array, dimension (LTB) +*> TB is REAL array, dimension (MAX(1,LTB)) *> On exit, details of the LU factorization of the band matrix. *> \endverbatim *> *> \param[in] LTB *> \verbatim *> LTB is INTEGER -*> The size of the array TB. LTB >= 4*N, internally +*> The size of the array TB. LTB >= MAX(1,4*N), internally *> used to select NB such that LTB >= (3*NB+1)*N. *> *> If LTB = -1, then a workspace query is assumed; the @@ -147,14 +147,15 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL workspace of size LWORK +*> WORK is REAL workspace of size (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The size of WORK. LWORK >= N, internally used to select NB -*> such that LWORK >= N*NB. +*> The size of WORK. 
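The SSYSV_AA hunk above folds the documented minimum into a single LWKMIN = MAX(1, 2*N, 3*N-2) that is used both for the argument check and as a floor on the reported optimum. A hedged sketch of the corresponding query-and-solve call; the 3x3 symmetric system is invented for illustration.

      PROGRAM SVAAQ
      IMPLICIT NONE
      INTEGER            N, NRHS, LDA, LDB, LWMAX
      PARAMETER          ( N = 3, NRHS = 1, LDA = N, LDB = N,
     $                     LWMAX = 512 )
      INTEGER            INFO, LWORK, IPIV( N )
      REAL               A( LDA, N ), B( LDB, NRHS ), WORK( LWMAX )
      EXTERNAL           SSYSV_AA
      INTRINSIC          INT, MIN
      DATA               A / 4.0E0, 1.0E0, 2.0E0,
     $                       1.0E0, 3.0E0, 0.0E0,
     $                       2.0E0, 0.0E0, 5.0E0 /
      DATA               B / 1.0E0, 2.0E0, 3.0E0 /
*     Query: WORK(1) now reports at least MAX(1, 2*N, 3*N-2).
      CALL SSYSV_AA( 'U', N, NRHS, A, LDA, IPIV, B, LDB, WORK, -1,
     $               INFO )
      LWORK = MIN( LWMAX, INT( WORK( 1 ) ) )
*     Factor A = U**T*T*U and solve A*X = B in one call.
      CALL SSYSV_AA( 'U', N, NRHS, A, LDA, IPIV, B, LDB, WORK,
     $               LWORK, INFO )
      WRITE( *, * ) 'INFO = ', INFO, ' solution ', B
      END
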
LWORK >= MAX(1,N), internally used to +*> select NB such that LWORK >= N*NB. *> *> If LWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the WORK array, @@ -204,12 +205,13 @@ * .. * .. Local Scalars .. LOGICAL UPPER, TQUERY, WQUERY - INTEGER LWKOPT + INTEGER LWKMIN, LWKOPT * .. * .. External Functions .. LOGICAL LSAME + EXTERNAL LSAME REAL SROUNDUP_LWORK - EXTERNAL LSAME, SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SSYTRF_AA_2STAGE, SSYTRS_AA_2STAGE, @@ -226,6 +228,7 @@ UPPER = LSAME( UPLO, 'U' ) WQUERY = ( LWORK.EQ.-1 ) TQUERY = ( LTB.EQ.-1 ) + LWKMIN = MAX( 1, N ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -234,18 +237,19 @@ INFO = -3 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -5 - ELSE IF( LTB.LT.( 4*N ) .AND. .NOT.TQUERY ) THEN + ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN INFO = -7 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -11 - ELSE IF( LWORK.LT.N .AND. .NOT.WQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.WQUERY ) THEN INFO = -13 END IF * IF( INFO.EQ.0 ) THEN CALL SSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, -1, IPIV, $ IPIV2, WORK, -1, INFO ) - LWKOPT = INT( WORK(1) ) + LWKOPT = MAX( LWKMIN, INT( WORK( 1 ) ) ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -255,7 +259,6 @@ RETURN END IF * -* * Compute the factorization A = U**T*T*U or A = L*T*L**T. * CALL SSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, IPIV2, @@ -269,7 +272,7 @@ * END IF * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/ssysvx.f b/lapack-netlib/SRC/ssysvx.f index 0d72217eb..06a6413f1 100644 --- a/lapack-netlib/SRC/ssysvx.f +++ b/lapack-netlib/SRC/ssysvx.f @@ -305,7 +305,7 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, NOFACT - INTEGER LWKOPT, NB + INTEGER LWKMIN, LWKOPT, NB REAL ANORM * .. * .. External Functions .. @@ -327,6 +327,7 @@ INFO = 0 NOFACT = LSAME( FACT, 'N' ) LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 3*N ) IF( .NOT.NOFACT .AND. .NOT.LSAME( FACT, 'F' ) ) THEN INFO = -1 ELSE IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) @@ -344,12 +345,12 @@ INFO = -11 ELSE IF( LDX.LT.MAX( 1, N ) ) THEN INFO = -13 - ELSE IF( LWORK.LT.MAX( 1, 3*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -18 END IF * IF( INFO.EQ.0 ) THEN - LWKOPT = MAX( 1, 3*N ) + LWKOPT = LWKMIN IF( NOFACT ) THEN NB = ILAENV( 1, 'SSYTRF', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( LWKOPT, N*NB ) diff --git a/lapack-netlib/SRC/ssytrd_2stage.f b/lapack-netlib/SRC/ssytrd_2stage.f index 5d70ae0d4..5b401c3d0 100644 --- a/lapack-netlib/SRC/ssytrd_2stage.f +++ b/lapack-netlib/SRC/ssytrd_2stage.f @@ -4,23 +4,23 @@ * * =========== DOCUMENTATION =========== * -* Online html documentation available at -* http://www.netlib.org/lapack/explore-html/ +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ * *> \htmlonly -*> Download SSYTRD_2STAGE + dependencies -*> -*> [TGZ] -*> -*> [ZIP] -*> +*> Download SSYTRD_2STAGE + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> *> [TXT] -*> \endhtmlonly +*> \endhtmlonly * * Definition: * =========== * -* SUBROUTINE SSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, +* SUBROUTINE SSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, * HOUS2, LHOUS2, WORK, LWORK, INFO ) * * IMPLICIT NONE @@ -34,7 +34,7 @@ * REAL A( LDA, * ), TAU( * ), * HOUS2( * ), WORK( * ) * .. 
-* +* * *> \par Purpose: * ============= @@ -52,11 +52,11 @@ *> \param[in] VECT *> \verbatim *> VECT is CHARACTER*1 -*> = 'N': No need for the Housholder representation, +*> = 'N': No need for the Housholder representation, *> in particular for the second stage (Band to *> tridiagonal) and thus LHOUS2 is of size max(1, 4*N); -*> = 'V': the Householder representation is needed to -*> either generate Q1 Q2 or to apply Q1 Q2, +*> = 'V': the Householder representation is needed to +*> either generate Q1 Q2 or to apply Q1 Q2, *> then LHOUS2 is to be queried and computed. *> (NOT AVAILABLE IN THIS RELEASE). *> \endverbatim @@ -86,7 +86,7 @@ *> triangular part of A is not referenced. *> On exit, if UPLO = 'U', the band superdiagonal *> of A are overwritten by the corresponding elements of the -*> internal band-diagonal matrix AB, and the elements above +*> internal band-diagonal matrix AB, and the elements above *> the KD superdiagonal, with the array TAU, represent the orthogonal *> matrix Q1 as a product of elementary reflectors; if UPLO *> = 'L', the diagonal and band subdiagonal of A are over- @@ -117,13 +117,13 @@ *> \param[out] TAU *> \verbatim *> TAU is REAL array, dimension (N-KD) -*> The scalar factors of the elementary reflectors of +*> The scalar factors of the elementary reflectors of *> the first stage (see Further Details). *> \endverbatim *> *> \param[out] HOUS2 *> \verbatim -*> HOUS2 is REAL array, dimension (LHOUS2) +*> HOUS2 is REAL array, dimension (MAX(1,LHOUS2)) *> Stores the Householder representation of the stage2 *> band to tridiagonal. *> \endverbatim @@ -132,6 +132,8 @@ *> \verbatim *> LHOUS2 is INTEGER *> The dimension of the array HOUS2. +*> LHOUS2 >= 1. +*> *> If LWORK = -1, or LHOUS2 = -1, *> then a query is assumed; the routine *> only calculates the optimal size of the HOUS2 array, returns @@ -149,17 +151,19 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK = MAX(1, dimension) -*> If LWORK = -1, or LHOUS2=-1, +*> The dimension of the array WORK. +*> If N = 0, LWORK >= 1, else LWORK = MAX(1, dimension). +*> +*> If LWORK = -1, or LHOUS2 = -1, *> then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> LWORK = MAX(1, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N -*> = N*KD + N*max(KD+1,FACTOPTNB) -*> + max(2*KD*KD, KD*NTHREADS) -*> + (KD+1)*N +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) +*> + (KD+1)*N *> where KD is the blocking size of the reduction, *> FACTOPTNB is the blocking used by the QR or LQ *> algorithm, usually FACTOPTNB=128 is a good choice @@ -177,12 +181,12 @@ * Authors: * ======== * -*> \author Univ. of Tennessee -*> \author Univ. of California Berkeley -*> \author Univ. of Colorado Denver -*> \author NAG Ltd. +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. * -*> \ingroup realSYcomputational +*> \ingroup hetrd_2stage * *> \par Further Details: * ===================== @@ -202,7 +206,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. 
-*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -210,16 +214,16 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim *> * ===================================================================== - SUBROUTINE SSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, + SUBROUTINE SSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, $ HOUS2, LHOUS2, WORK, LWORK, INFO ) * IMPLICIT NONE @@ -265,10 +269,13 @@ * KD = ILAENV2STAGE( 1, 'SSYTRD_2STAGE', VECT, N, -1, -1, -1 ) IB = ILAENV2STAGE( 2, 'SSYTRD_2STAGE', VECT, N, KD, -1, -1 ) - LHMIN = ILAENV2STAGE( 3, 'SSYTRD_2STAGE', VECT, N, KD, IB, -1 ) - LWMIN = ILAENV2STAGE( 4, 'SSYTRD_2STAGE', VECT, N, KD, IB, -1 ) -* WRITE(*,*),'SSYTRD_2STAGE N KD UPLO LHMIN LWMIN ',N, KD, UPLO, -* $ LHMIN, LWMIN + IF( N.EQ.0 ) THEN + LHMIN = 1 + LWMIN = 1 + ELSE + LHMIN = ILAENV2STAGE( 3, 'SSYTRD_2STAGE', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV2STAGE( 4, 'SSYTRD_2STAGE', VECT, N, KD, IB, -1 ) + END IF * IF( .NOT.LSAME( VECT, 'N' ) ) THEN INFO = -1 @@ -309,14 +316,14 @@ LWRK = LWORK-LDAB*N ABPOS = 1 WPOS = ABPOS + LDAB*N - CALL SSYTRD_SY2SB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB, + CALL SSYTRD_SY2SB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB, $ TAU, WORK( WPOS ), LWRK, INFO ) IF( INFO.NE.0 ) THEN CALL XERBLA( 'SSYTRD_SY2SB', -INFO ) RETURN END IF - CALL SSYTRD_SB2ST( 'Y', VECT, UPLO, N, KD, - $ WORK( ABPOS ), LDAB, D, E, + CALL SSYTRD_SB2ST( 'Y', VECT, UPLO, N, KD, + $ WORK( ABPOS ), LDAB, D, E, $ HOUS2, LHOUS2, WORK( WPOS ), LWRK, INFO ) IF( INFO.NE.0 ) THEN CALL XERBLA( 'SSYTRD_SB2ST', -INFO ) @@ -324,8 +331,7 @@ END IF * * - HOUS2( 1 ) = LHMIN - WORK( 1 ) = LWMIN + WORK( 1 ) = LWMIN RETURN * * End of SSYTRD_2STAGE diff --git a/lapack-netlib/SRC/ssytrd_sb2st.F b/lapack-netlib/SRC/ssytrd_sb2st.F index 32bae26dc..111eaa93e 100644 --- a/lapack-netlib/SRC/ssytrd_sb2st.F +++ b/lapack-netlib/SRC/ssytrd_sb2st.F @@ -132,15 +132,17 @@ *> *> \param[out] HOUS *> \verbatim -*> HOUS is REAL array, dimension LHOUS, that -*> store the Householder representation. +*> HOUS is REAL array, dimension (MAX(1,LHOUS)) +*> Stores the Householder representation. *> \endverbatim *> *> \param[in] LHOUS *> \verbatim *> LHOUS is INTEGER -*> The dimension of the array HOUS. LHOUS = MAX(1, dimension) -*> If LWORK = -1, or LHOUS=-1, +*> The dimension of the array HOUS. +*> If N = 0 or KD <= 1, LHOUS >= 1, else LHOUS = MAX(1, dimension) +*> +*> If LWORK = -1, or LHOUS = -1, *> then a query is assumed; the routine *> only calculates the optimal size of the HOUS array, returns *> this value as the first entry of the HOUS array, and no error @@ -152,14 +154,17 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL array, dimension LWORK. +*> WORK is REAL array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns optimal LWORK. 
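The SSYTRD_2STAGE hunks above (and the SSYTRD_SB2ST ones that follow) give the HOUS2/LHOUS2 array the same treatment as WORK/LWORK, with both minima reported as 1 when N = 0. Because the routine takes two sized arrays, a single query call with LWORK = -1 (or LHOUS2 = -1) is expected to fill in both HOUS2(1) and WORK(1), per the documentation shown above. A hedged sketch under that assumption; the 4x4 matrix and the deliberately generous static bounds are invented for illustration.

      PROGRAM T2SQRY
      IMPLICIT NONE
      INTEGER            N, LDA, LHMAX, LWMAX
      PARAMETER          ( N = 4, LDA = N, LHMAX = 100000,
     $                     LWMAX = 100000 )
      INTEGER            INFO, LHOUS2, LWORK, I, J
      REAL               A( LDA, N ), D( N ), E( N-1 ), TAU( N ),
     $                   HOUS2( LHMAX ), WORK( LWMAX )
      EXTERNAL           SSYTRD_2STAGE
      INTRINSIC          INT, MIN, REAL
      DO 20 J = 1, N
         DO 10 I = 1, N
            A( I, J ) = 1.0E0 / REAL( I + J )
   10    CONTINUE
   20 CONTINUE
*     One call with LWORK = -1 (or LHOUS2 = -1) reports both sizes:
*     HOUS2(1) returns the required LHOUS2, WORK(1) the required
*     LWORK; the static bounds here are generous for N = 4.
      CALL SSYTRD_2STAGE( 'N', 'U', N, A, LDA, D, E, TAU, HOUS2, -1,
     $                    WORK, -1, INFO )
      LHOUS2 = MIN( LHMAX, INT( HOUS2( 1 ) ) )
      LWORK = MIN( LWMAX, INT( WORK( 1 ) ) )
      CALL SSYTRD_2STAGE( 'N', 'U', N, A, LDA, D, E, TAU, HOUS2,
     $                    LHOUS2, WORK, LWORK, INFO )
      WRITE( *, * ) 'INFO = ', INFO, ' diagonal of T ', D
      END
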
*> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK = MAX(1, dimension) -*> If LWORK = -1, or LHOUS=-1, +*> The dimension of the array WORK. +*> IF N = 0 or KD <= 1, LWORK >= 1, else LWORK = MAX(1, dimension) +*> +*> If LWORK = -1, or LHOUS = -1, *> then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error @@ -261,7 +266,7 @@ INTEGER I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST, $ ED, STIND, EDIND, BLKLASTIND, COLPT, THED, $ STEPERCOL, GRSIZ, THGRSIZ, THGRNB, THGRID, - $ NBTILES, TTYPE, TID, NTHREADS, DEBUG, + $ NBTILES, TTYPE, TID, NTHREADS, $ ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS, $ INDA, INDW, APOS, SIZEA, LDA, INDV, INDTAU, $ SISEV, SIZETAU, LDV, LHMIN, LWMIN @@ -283,7 +288,6 @@ * Determine the minimal workspace size required. * Test the input parameters * - DEBUG = 0 INFO = 0 AFTERS1 = LSAME( STAGE1, 'Y' ) WANTQ = LSAME( VECT, 'V' ) @@ -292,9 +296,14 @@ * * Determine the block size, the workspace size and the hous size. * - IB = ILAENV2STAGE( 2, 'SSYTRD_SB2ST', VECT, N, KD, -1, -1 ) - LHMIN = ILAENV2STAGE( 3, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 ) - LWMIN = ILAENV2STAGE( 4, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 ) + IB = ILAENV2STAGE( 2, 'SSYTRD_SB2ST', VECT, N, KD, -1, -1 ) + IF( N.EQ.0 .OR. KD.LE.1 ) THEN + LHMIN = 1 + LWMIN = 1 + ELSE + LHMIN = ILAENV2STAGE( 3, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV2STAGE( 4, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 ) + END IF * IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN INFO = -1 @@ -315,8 +324,8 @@ END IF * IF( INFO.EQ.0 ) THEN - HOUS( 1 ) = LHMIN - WORK( 1 ) = SROUNDUP_LWORK(LWMIN) + HOUS( 1 ) = SROUNDUP_LWORK( LHMIN ) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN @@ -544,8 +553,7 @@ 170 CONTINUE ENDIF * - HOUS( 1 ) = LHMIN - WORK( 1 ) = SROUNDUP_LWORK(LWMIN) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of SSYTRD_SB2ST diff --git a/lapack-netlib/SRC/ssytrd_sy2sb.f b/lapack-netlib/SRC/ssytrd_sy2sb.f index 4efc43630..3996e07bb 100644 --- a/lapack-netlib/SRC/ssytrd_sy2sb.f +++ b/lapack-netlib/SRC/ssytrd_sy2sb.f @@ -124,7 +124,7 @@ *> \param[out] WORK *> \verbatim *> WORK is REAL array, dimension (LWORK) -*> On exit, if INFO = 0, or if LWORK=-1, +*> On exit, if INFO = 0, or if LWORK = -1, *> WORK(1) returns the size of LWORK. *> \endverbatim *> @@ -132,7 +132,9 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK which should be calculated -*> by a workspace query. LWORK = MAX(1, LWORK_QUERY) +*> by a workspace query. +*> If N <= KD+1, LWORK >= 1, else LWORK = MAX(1, LWORK_QUERY) +*> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error @@ -294,8 +296,12 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) - LWMIN = ILAENV2STAGE( 4, 'SSYTRD_SY2SB', '', N, KD, -1, -1 ) - + IF( N.LE.KD+1 ) THEN + LWMIN = 1 + ELSE + LWMIN = ILAENV2STAGE( 4, 'SSYTRD_SY2SB', '', N, KD, -1, -1 ) + END IF +* IF( .NOT.UPPER .AND. 
.NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -314,7 +320,7 @@ CALL XERBLA( 'SSYTRD_SY2SB', -INFO ) RETURN ELSE IF( LQUERY ) THEN - WORK( 1 ) = SROUNDUP_LWORK(LWMIN) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN END IF * @@ -507,7 +513,7 @@ END IF * - WORK( 1 ) = SROUNDUP_LWORK(LWMIN) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of SSYTRD_SY2SB diff --git a/lapack-netlib/SRC/ssytrf.f b/lapack-netlib/SRC/ssytrf.f index a788fbcf0..55f3a4f0f 100644 --- a/lapack-netlib/SRC/ssytrf.f +++ b/lapack-netlib/SRC/ssytrf.f @@ -234,7 +234,7 @@ * NB = ILAENV( 1, 'SSYTRF', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( 1, N*NB ) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -353,7 +353,8 @@ END IF * 40 CONTINUE - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) +* + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * * End of SSYTRF diff --git a/lapack-netlib/SRC/ssytrf_aa.f b/lapack-netlib/SRC/ssytrf_aa.f index d6408a978..af32fb064 100644 --- a/lapack-netlib/SRC/ssytrf_aa.f +++ b/lapack-netlib/SRC/ssytrf_aa.f @@ -101,8 +101,10 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >= MAX(1,2*N). For optimum performance -*> LWORK >= N*(1+NB), where NB is the optimal blocksize. +*> The length of WORK. +*> LWORK >= 1, if N <= 1, and LWORK >= 2*N, otherwise. +*> For optimum performance LWORK >= N*(1+NB), where NB is +*> the optimal blocksize, returned by ILAENV. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -128,7 +130,7 @@ *> \ingroup hetrf_aa * * ===================================================================== - SUBROUTINE SSYTRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO) + SUBROUTINE SSYTRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -142,19 +144,19 @@ * .. * .. Array Arguments .. INTEGER IPIV( * ) - REAL A( LDA, * ), WORK( * ) + REAL A( LDA, * ), WORK( * ) * .. * * ===================================================================== * .. Parameters .. - REAL ZERO, ONE + REAL ZERO, ONE PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) * * .. Local Scalars .. LOGICAL LQUERY, UPPER - INTEGER J, LWKOPT + INTEGER J, LWKMIN, LWKOPT INTEGER NB, MJ, NJ, K1, K2, J1, J2, J3, JB - REAL ALPHA + REAL ALPHA * .. * .. External Functions .. LOGICAL LSAME @@ -180,19 +182,26 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) + IF( N.LE.1 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = 2*N + LWKOPT = (NB+1)*N + END IF +* IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. 
.NOT.LQUERY ) THEN INFO = -7 END IF * IF( INFO.EQ.0 ) THEN - LWKOPT = (NB+1)*N - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -204,11 +213,11 @@ * * Quick return * - IF ( N.EQ.0 ) THEN + IF( N.EQ.0 ) THEN RETURN ENDIF IPIV( 1 ) = 1 - IF ( N.EQ.1 ) THEN + IF( N.EQ.1 ) THEN RETURN END IF * @@ -458,7 +467,8 @@ END IF * 20 CONTINUE - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) +* + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * * End of SSYTRF_AA diff --git a/lapack-netlib/SRC/ssytrf_aa_2stage.f b/lapack-netlib/SRC/ssytrf_aa_2stage.f index abe6564c5..6b5cdee1b 100644 --- a/lapack-netlib/SRC/ssytrf_aa_2stage.f +++ b/lapack-netlib/SRC/ssytrf_aa_2stage.f @@ -94,7 +94,7 @@ *> \param[in] LTB *> \verbatim *> LTB is INTEGER -*> The size of the array TB. LTB >= 4*N, internally +*> The size of the array TB. LTB >= MAX(1,4*N), internally *> used to select NB such that LTB >= (3*NB+1)*N. *> *> If LTB = -1, then a workspace query is assumed; the @@ -121,14 +121,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL workspace of size LWORK +*> WORK is REAL workspace of size (MAX(1,LWORK)) *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The size of WORK. LWORK >= N, internally used to select NB -*> such that LWORK >= N*NB. +*> The size of WORK. LWORK >= MAX(1,N), internally used to +*> select NB such that LWORK >= N*NB. *> *> If LWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the WORK array, @@ -212,9 +212,9 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF ( LTB .LT. 4*N .AND. .NOT.TQUERY ) THEN + ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN INFO = -6 - ELSE IF ( LWORK .LT. N .AND. .NOT.WQUERY ) THEN + ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.WQUERY ) THEN INFO = -10 END IF * @@ -228,10 +228,10 @@ NB = ILAENV( 1, 'SSYTRF_AA_2STAGE', UPLO, N, -1, -1, -1 ) IF( INFO.EQ.0 ) THEN IF( TQUERY ) THEN - TB( 1 ) = (3*NB+1)*N + TB( 1 ) = SROUNDUP_LWORK( MAX( 1, (3*NB+1)*N ) ) END IF IF( WQUERY ) THEN - WORK( 1 ) = SROUNDUP_LWORK(N*NB) + WORK( 1 ) = SROUNDUP_LWORK( MAX( 1, N*NB ) ) END IF END IF IF( TQUERY .OR. WQUERY ) THEN @@ -240,7 +240,7 @@ * * Quick return * - IF ( N.EQ.0 ) THEN + IF( N.EQ.0 ) THEN RETURN ENDIF * diff --git a/lapack-netlib/SRC/ssytrf_rk.f b/lapack-netlib/SRC/ssytrf_rk.f index 72830543c..89ecf38fd 100644 --- a/lapack-netlib/SRC/ssytrf_rk.f +++ b/lapack-netlib/SRC/ssytrf_rk.f @@ -177,14 +177,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL array, dimension ( MAX(1,LWORK) ). +*> WORK is REAL array, dimension (MAX(1,LWORK)). *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned *> by ILAENV. 
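Across the SSYTRF family these hunks write the optimal LWORK back as SROUNDUP_LWORK( LWKOPT ): the value is rounded up before being stored in the REAL WORK(1), so a caller that converts it back with INT() cannot land below the true requirement because of single-precision rounding. A minimal caller-side sketch of that standard query, shown here for plain SSYTRF (the wrapper name is illustrative only):

      ! Hedged sketch: LWORK = -1 query for SSYTRF, then the factorization.
      subroutine example_ssytrf( uplo, n, a, lda, ipiv, info )
         implicit none
         character, intent(in)    :: uplo
         integer,   intent(in)    :: n, lda
         integer,   intent(out)   :: ipiv( * ), info
         real,      intent(inout) :: a( lda, * )
         real                     :: wq( 1 )
         real,      allocatable   :: work( : )
         integer                  :: lwork

         call ssytrf( uplo, n, a, lda, ipiv, wq, -1, info )   ! workspace query
         lwork = int( wq( 1 ) )            ! safe: WORK(1) was rounded up
         allocate( work( max( 1, lwork ) ) )
         call ssytrf( uplo, n, a, lda, ipiv, work, lwork, info )
      end subroutine example_ssytrf
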
*> @@ -312,7 +312,7 @@ * NB = ILAENV( 1, 'SSYTRF_RK', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( 1, N*NB ) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -488,7 +488,7 @@ * END IF * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * * End of SSYTRF_RK diff --git a/lapack-netlib/SRC/ssytrf_rook.f b/lapack-netlib/SRC/ssytrf_rook.f index 339a229e7..7c2cbbc57 100644 --- a/lapack-netlib/SRC/ssytrf_rook.f +++ b/lapack-netlib/SRC/ssytrf_rook.f @@ -118,7 +118,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned by ILAENV. *> *> If LWORK = -1, then a workspace query is assumed; the routine @@ -260,7 +260,7 @@ * NB = ILAENV( 1, 'SSYTRF_ROOK', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( 1, N*NB ) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -383,7 +383,8 @@ END IF * 40 CONTINUE - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) +* + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * * End of SSYTRF_ROOK diff --git a/lapack-netlib/SRC/ssytri2.f b/lapack-netlib/SRC/ssytri2.f index 49f6cad65..fd1c53473 100644 --- a/lapack-netlib/SRC/ssytri2.f +++ b/lapack-netlib/SRC/ssytri2.f @@ -88,16 +88,16 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL array, dimension (N+NB+1)*(NB+3) +*> WORK is REAL array, dimension (MAX(1,LWORK)) *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*> WORK is size >= (N+NB+1)*(NB+3) +*> If N = 0, LWORK >= 1, else LWORK >= (N+NB+1)*(NB+3). *> If LWORK = -1, then a workspace query is assumed; the routine -*> calculates: +*> calculates: *> - the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, *> - and no error message related to LWORK is issued by XERBLA. @@ -120,7 +120,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realSYcomputational +*> \ingroup hetri2 * * ===================================================================== SUBROUTINE SSYTRI2( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO ) @@ -147,7 +147,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SSYTRI, SSYTRI2X, XERBLA @@ -159,9 +160,13 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) +* * Get blocksize +* NBMAX = ILAENV( 1, 'SSYTRF', UPLO, N, -1, -1, -1 ) - IF ( NBMAX .GE. N ) THEN + IF( N.EQ.0 ) THEN + MINSIZE = 1 + ELSE IF( NBMAX.GE.N ) THEN MINSIZE = N ELSE MINSIZE = (N+NBMAX+1)*(NBMAX+3) @@ -173,28 +178,29 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF (LWORK .LT. MINSIZE .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.MINSIZE .AND. .NOT.LQUERY ) THEN INFO = -7 END IF -* -* Quick return if possible -* * IF( INFO.NE.0 ) THEN CALL XERBLA( 'SSYTRI2', -INFO ) RETURN ELSE IF( LQUERY ) THEN - WORK(1)=MINSIZE + WORK( 1 ) = SROUNDUP_LWORK( MINSIZE ) RETURN END IF +* +* Quick return if possible +* IF( N.EQ.0 ) $ RETURN - - IF( NBMAX .GE. 
N ) THEN +* + IF( NBMAX.GE.N ) THEN CALL SSYTRI( UPLO, N, A, LDA, IPIV, WORK, INFO ) ELSE CALL SSYTRI2X( UPLO, N, A, LDA, IPIV, WORK, NBMAX, INFO ) END IF +* RETURN * * End of SSYTRI2 diff --git a/lapack-netlib/SRC/ssytri_3.f b/lapack-netlib/SRC/ssytri_3.f index bca01105d..f0152a149 100644 --- a/lapack-netlib/SRC/ssytri_3.f +++ b/lapack-netlib/SRC/ssytri_3.f @@ -119,16 +119,17 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL array, dimension (N+NB+1)*(NB+3). +*> WORK is REAL array, dimension (MAX(1,LWORK)). *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >= (N+NB+1)*(NB+3). +*> The length of WORK. +*> If N = 0, LWORK >= 1, else LWORK >= (N+NB+1)*(NB+3). *> -*> If LDWORK = -1, then a workspace query is assumed; +*> If LWORK = -1, then a workspace query is assumed; *> the routine only calculates the optimal size of the optimal *> size of the WORK array, returns this value as the first *> entry of the WORK array, and no error message related to @@ -209,8 +210,13 @@ * * Determine the block size * - NB = MAX( 1, ILAENV( 1, 'SSYTRI_3', UPLO, N, -1, -1, -1 ) ) - LWKOPT = ( N+NB+1 ) * ( NB+3 ) + IF( N.EQ.0 ) THEN + LWKOPT = 1 + ELSE + NB = MAX( 1, ILAENV( 1, 'SSYTRI_3', UPLO, N, -1, -1, -1 ) ) + LWKOPT = ( N+NB+1 ) * ( NB+3 ) + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 @@ -218,7 +224,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF ( LWORK .LT. LWKOPT .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKOPT .AND. .NOT.LQUERY ) THEN INFO = -8 END IF * @@ -226,7 +232,6 @@ CALL XERBLA( 'SSYTRI_3', -INFO ) RETURN ELSE IF( LQUERY ) THEN - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN END IF * @@ -237,7 +242,7 @@ * CALL SSYTRI_3X( UPLO, N, A, LDA, E, IPIV, WORK, NB, INFO ) * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/ssytrs_aa.f b/lapack-netlib/SRC/ssytrs_aa.f index 12fca0c71..265cf0c1d 100644 --- a/lapack-netlib/SRC/ssytrs_aa.f +++ b/lapack-netlib/SRC/ssytrs_aa.f @@ -105,7 +105,13 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,3*N-2). +*> The dimension of the array WORK. +*> If MIN(N,NRHS) = 0, LWORK >= 1, else LWORK >= 3*N-2. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the minimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. *> \endverbatim *> *> \param[out] INFO @@ -141,7 +147,7 @@ * .. * .. Array Arguments .. INTEGER IPIV( * ) - REAL A( LDA, * ), B( LDB, * ), WORK( * ) + REAL A( LDA, * ), B( LDB, * ), WORK( * ) * .. * * ===================================================================== @@ -151,24 +157,31 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, UPPER - INTEGER K, KP, LWKOPT + INTEGER K, KP, LWKMIN * .. * .. External Functions .. LOGICAL LSAME + EXTERNAL LSAME REAL SROUNDUP_LWORK - EXTERNAL LSAME, SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SGTSV, SSWAP, SLACPY, STRSM, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC MAX + INTRINSIC MIN, MAX * .. * .. Executable Statements .. * INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) + IF( MIN( N, NRHS ).EQ.0 ) THEN + LWKMIN = 1 + ELSE + LWKMIN = 3*N-2 + END IF +* IF( .NOT.UPPER .AND. 
.NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -179,21 +192,20 @@ INFO = -5 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -8 - ELSE IF( LWORK.LT.MAX( 1, 3*N-2 ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'SSYTRS_AA', -INFO ) RETURN ELSE IF( LQUERY ) THEN - LWKOPT = (3*N-2) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKMIN ) RETURN END IF * * Quick return if possible * - IF( N.EQ.0 .OR. NRHS.EQ.0 ) + IF( MIN( N, NRHS ).EQ.0 ) $ RETURN * IF( UPPER ) THEN diff --git a/lapack-netlib/SRC/zbbcsd.f b/lapack-netlib/SRC/zbbcsd.f index 757e98c71..6601f4a06 100644 --- a/lapack-netlib/SRC/zbbcsd.f +++ b/lapack-netlib/SRC/zbbcsd.f @@ -804,7 +804,7 @@ CALL DLARTGP( B22BULGE, B22E(I-1), RWORK(IU2SN+I-1), $ RWORK(IU2CS+I-1), R ) ELSE IF( NU .LT. MU ) THEN - CALL DLARTGS( B21E(I), B21E(I+1), NU, RWORK(IU2CS+I-1), + CALL DLARTGS( B21E(I), B21D(I+1), NU, RWORK(IU2CS+I-1), $ RWORK(IU2SN+I-1) ) ELSE CALL DLARTGS( B22D(I), B22E(I), MU, RWORK(IU2CS+I-1), diff --git a/lapack-netlib/SRC/zgebrd.f b/lapack-netlib/SRC/zgebrd.f index f1791c6a4..c1a6169a7 100644 --- a/lapack-netlib/SRC/zgebrd.f +++ b/lapack-netlib/SRC/zgebrd.f @@ -122,7 +122,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of the array WORK. LWORK >= max(1,M,N). +*> The length of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MAX(M,N), otherwise. *> For optimum performance LWORK >= (M+N)*NB, where NB *> is the optimal blocksize. *> @@ -147,7 +148,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEcomputational +*> \ingroup gebrd * *> \par Further Details: * ===================== @@ -223,8 +224,8 @@ * .. * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IINFO, J, LDWRKX, LDWRKY, LWKOPT, MINMN, NB, - $ NBMIN, NX, WS + INTEGER I, IINFO, J, LDWRKX, LDWRKY, LWKMIN, LWKOPT, + $ MINMN, NB, NBMIN, NX, WS * .. * .. External Subroutines .. EXTERNAL XERBLA, ZGEBD2, ZGEMM, ZLABRD @@ -241,9 +242,17 @@ * Test the input parameters * INFO = 0 - NB = MAX( 1, ILAENV( 1, 'ZGEBRD', ' ', M, N, -1, -1 ) ) - LWKOPT = ( M+N )*NB + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = MAX( M, N ) + NB = MAX( 1, ILAENV( 1, 'ZGEBRD', ' ', M, N, -1, -1 ) ) + LWKOPT = ( M+N )*NB + END IF WORK( 1 ) = DBLE( LWKOPT ) +* LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -251,7 +260,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, M, N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF IF( INFO.LT.0 ) THEN @@ -263,7 +272,6 @@ * * Quick return if possible * - MINMN = MIN( M, N ) IF( MINMN.EQ.0 ) THEN WORK( 1 ) = 1 RETURN @@ -282,7 +290,7 @@ * Determine when to switch from blocked to unblocked code. * IF( NX.LT.MINMN ) THEN - WS = ( M+N )*NB + WS = LWKOPT IF( LWORK.LT.WS ) THEN * * Not enough work space for the optimal NB, consider using diff --git a/lapack-netlib/SRC/zgedmd.f90 b/lapack-netlib/SRC/zgedmd.f90 index 090641ad8..5045cb166 100644 --- a/lapack-netlib/SRC/zgedmd.f90 +++ b/lapack-netlib/SRC/zgedmd.f90 @@ -1,389 +1,539 @@ - SUBROUTINE ZGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & - M, N, X, LDX, Y, LDY, NRNK, TOL, & - K, EIGS, Z, LDZ, RES, B, LDB, & - W, LDW, S, LDS, ZWORK, LZWORK, & - RWORK, LRWORK, IWORK, LIWORK, INFO ) -! March 2023 -!..... - USE iso_fortran_env - IMPLICIT NONE - INTEGER, PARAMETER :: WP = real64 - -!..... -! 
Scalar arguments - CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF - INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & - NRNK, LDZ, LDB, LDW, LDS, & - LIWORK, LRWORK, LZWORK - INTEGER, INTENT(OUT) :: K, INFO - REAL(KIND=WP), INTENT(IN) :: TOL -! Array arguments - COMPLEX(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) - COMPLEX(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & - W(LDW,*), S(LDS,*) - COMPLEX(KIND=WP), INTENT(OUT) :: EIGS(*) - COMPLEX(KIND=WP), INTENT(OUT) :: ZWORK(*) - REAL(KIND=WP), INTENT(OUT) :: RES(*) - REAL(KIND=WP), INTENT(OUT) :: RWORK(*) - INTEGER, INTENT(OUT) :: IWORK(*) -!............................................................ -! Purpose -! ======= -! ZGEDMD computes the Dynamic Mode Decomposition (DMD) for -! a pair of data snapshot matrices. For the input matrices -! X and Y such that Y = A*X with an unaccessible matrix -! A, ZGEDMD computes a certain number of Ritz pairs of A using -! the standard Rayleigh-Ritz extraction from a subspace of -! range(X) that is determined using the leading left singular -! vectors of X. Optionally, ZGEDMD returns the residuals -! of the computed Ritz pairs, the information needed for -! a refinement of the Ritz vectors, or the eigenvectors of -! the Exact DMD. -! For further details see the references listed -! below. For more details of the implementation see [3]. -! -! References -! ========== -! [1] P. Schmid: Dynamic mode decomposition of numerical -! and experimental data, -! Journal of Fluid Mechanics 656, 5-28, 2010. -! [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal -! decompositions: analysis and enhancements, -! SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. -! [3] Z. Drmac: A LAPACK implementation of the Dynamic -! Mode Decomposition I. Technical report. AIMDyn Inc. -! and LAPACK Working Note 298. -! [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. -! Brunton, N. Kutz: On Dynamic Mode Decomposition: -! Theory and Applications, Journal of Computational -! Dynamics 1(2), 391 -421, 2014. +!> \brief \b ZGEDMD computes the Dynamic Mode Decomposition (DMD) for a pair of data snapshot matrices. +! +! =========== DOCUMENTATION =========== +! +! Definition: +! =========== ! +! SUBROUTINE ZGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & +! M, N, X, LDX, Y, LDY, NRNK, TOL, & +! K, EIGS, Z, LDZ, RES, B, LDB, & +! W, LDW, S, LDS, ZWORK, LZWORK, & +! RWORK, LRWORK, IWORK, LIWORK, INFO ) +!...... +! USE iso_fortran_env +! IMPLICIT NONE +! INTEGER, PARAMETER :: WP = real64 +! +!...... +! Scalar arguments +! CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF +! INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & +! NRNK, LDZ, LDB, LDW, LDS, & +! LIWORK, LRWORK, LZWORK +! INTEGER, INTENT(OUT) :: K, INFO +! REAL(KIND=WP), INTENT(IN) :: TOL +! Array arguments +! COMPLEX(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) +! COMPLEX(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & +! W(LDW,*), S(LDS,*) +! COMPLEX(KIND=WP), INTENT(OUT) :: EIGS(*) +! COMPLEX(KIND=WP), INTENT(OUT) :: ZWORK(*) +! REAL(KIND=WP), INTENT(OUT) :: RES(*) +! REAL(KIND=WP), INTENT(OUT) :: RWORK(*) +! INTEGER, INTENT(OUT) :: IWORK(*) +! +!............................................................ +!> \par Purpose: +! ============= +!> \verbatim +!> ZGEDMD computes the Dynamic Mode Decomposition (DMD) for +!> a pair of data snapshot matrices. 
For the input matrices +!> X and Y such that Y = A*X with an unaccessible matrix +!> A, ZGEDMD computes a certain number of Ritz pairs of A using +!> the standard Rayleigh-Ritz extraction from a subspace of +!> range(X) that is determined using the leading left singular +!> vectors of X. Optionally, ZGEDMD returns the residuals +!> of the computed Ritz pairs, the information needed for +!> a refinement of the Ritz vectors, or the eigenvectors of +!> the Exact DMD. +!> For further details see the references listed +!> below. For more details of the implementation see [3]. +!> \endverbatim +!............................................................ +!> \par References: +! ================ +!> \verbatim +!> [1] P. Schmid: Dynamic mode decomposition of numerical +!> and experimental data, +!> Journal of Fluid Mechanics 656, 5-28, 2010. +!> [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal +!> decompositions: analysis and enhancements, +!> SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. +!> [3] Z. Drmac: A LAPACK implementation of the Dynamic +!> Mode Decomposition I. Technical report. AIMDyn Inc. +!> and LAPACK Working Note 298. +!> [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. +!> Brunton, N. Kutz: On Dynamic Mode Decomposition: +!> Theory and Applications, Journal of Computational +!> Dynamics 1(2), 391 -421, 2014. +!> \endverbatim !...................................................................... -! Developed and supported by: -! =========================== -! Developed and coded by Zlatko Drmac, Faculty of Science, -! University of Zagreb; drmac@math.hr -! In cooperation with -! AIMdyn Inc., Santa Barbara, CA. -! and supported by -! - DARPA SBIR project "Koopman Operator-Based Forecasting -! for Nonstationary Processes from Near-Term, Limited -! Observational Data" Contract No: W31P4Q-21-C-0007 -! - DARPA PAI project "Physics-Informed Machine Learning -! Methodologies" Contract No: HR0011-18-9-0033 -! - DARPA MoDyL project "A Data-Driven, Operator-Theoretic -! Framework for Space-Time Analysis of Process Dynamics" -! Contract No: HR0011-16-C-0116 -! Any opinions, findings and conclusions or recommendations -! expressed in this material are those of the author and -! do not necessarily reflect the views of the DARPA SBIR -! Program Office -!============================================================ -! Distribution Statement A: -! Approved for Public Release, Distribution Unlimited. -! Cleared by DARPA on September 29, 2022 -!============================================================ +!> \par Developed and supported by: +! ================================ +!> \verbatim +!> Developed and coded by Zlatko Drmac, Faculty of Science, +!> University of Zagreb; drmac@math.hr +!> In cooperation with +!> AIMdyn Inc., Santa Barbara, CA. +!> and supported by +!> - DARPA SBIR project "Koopman Operator-Based Forecasting +!> for Nonstationary Processes from Near-Term, Limited +!> Observational Data" Contract No: W31P4Q-21-C-0007 +!> - DARPA PAI project "Physics-Informed Machine Learning +!> Methodologies" Contract No: HR0011-18-9-0033 +!> - DARPA MoDyL project "A Data-Driven, Operator-Theoretic +!> Framework for Space-Time Analysis of Process Dynamics" +!> Contract No: HR0011-16-C-0116 +!> Any opinions, findings and conclusions or recommendations +!> expressed in this material are those of the author and +!> do not necessarily reflect the views of the DARPA SBIR +!> Program Office +!> \endverbatim +!...................................................................... 
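ZGEDMD takes an extensive argument list, documented parameter by parameter below. As an orientation aid, here is a hypothetical caller sketch that first queries all three workspaces with LZWORK = LRWORK = LIWORK = -1 and then runs the decomposition with WHTSVD = 1 (ZGESVD), NRNK = -1, scaling and residuals requested. Only the documented ZGEDMD interface is relied on; every other name and size choice is an assumption for illustration.

      ! Hypothetical sketch (not part of the patch): workspace query plus
      ! the actual DMD call for M-by-N snapshot matrices X and Y (Y = A*X).
      subroutine example_zgedmd( m, n, x, ldx, y, ldy, info )
         use iso_fortran_env, only: wp => real64
         implicit none
         integer, intent(in)        :: m, n, ldx, ldy
         integer, intent(out)       :: info
         complex(wp), intent(inout) :: x( ldx, * ), y( ldy, * )
         complex(wp), allocatable   :: z(:,:), b(:,:), w(:,:), s(:,:)
         complex(wp), allocatable   :: eigs(:), zwork(:)
         real(wp),    allocatable   :: res(:), rwork(:)
         integer,     allocatable   :: iwork(:)
         complex(wp)                :: zq( 2 )
         real(wp)                   :: rq( 1 ), tol
         integer                    :: iq( 1 ), k, lzwork, lrwork, liwork

         tol = 1.0e-9_wp
         allocate( z( m, n ), b( m, n ), w( n, n ), s( n, n ),            &
                   eigs( n ), res( n ) )

         ! Workspace query: LZWORK = LRWORK = LIWORK = -1.  ZWORK(2) then
         ! holds the optimal complex length, RWORK(1) and IWORK(1) the
         ! minimal real and integer lengths.
         call zgedmd( 'S', 'V', 'R', 'N', 1, m, n, x, ldx, y, ldy,        &
                      -1, tol, k, eigs, z, m, res, b, m, w, n, s, n,      &
                      zq, -1, rq, -1, iq, -1, info )
         lzwork = int( zq( 2 ) )
         lrwork = int( rq( 1 ) )
         liwork = iq( 1 )
         allocate( zwork( max( 2, lzwork ) ), rwork( max( 1, lrwork ) ),  &
                   iwork( max( 1, liwork ) ) )

         ! Actual DMD: Ritz values in EIGS(1:K), Ritz vectors in Z(:,1:K),
         ! residuals in RES(1:K).
         call zgedmd( 'S', 'V', 'R', 'N', 1, m, n, x, ldx, y, ldy,        &
                      -1, tol, k, eigs, z, m, res, b, m, w, n, s, n,      &
                      zwork, lzwork, rwork, lrwork, iwork, liwork, info )
      end subroutine example_zgedmd
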
+!> \par Distribution Statement A: +! ============================== +!> \verbatim +!> Approved for Public Release, Distribution Unlimited. +!> Cleared by DARPA on September 29, 2022 +!> \endverbatim !............................................................ ! Arguments ! ========= -! JOBS (input) CHARACTER*1 -! Determines whether the initial data snapshots are scaled -! by a diagonal matrix. -! 'S' :: The data snapshots matrices X and Y are multiplied -! with a diagonal matrix D so that X*D has unit -! nonzero columns (in the Euclidean 2-norm) -! 'C' :: The snapshots are scaled as with the 'S' option. -! If it is found that an i-th column of X is zero -! vector and the corresponding i-th column of Y is -! non-zero, then the i-th column of Y is set to -! zero and a warning flag is raised. -! 'Y' :: The data snapshots matrices X and Y are multiplied -! by a diagonal matrix D so that Y*D has unit -! nonzero columns (in the Euclidean 2-norm) -! 'N' :: No data scaling. +! +!> \param[in] JOBS +!> \verbatim +!> JOBS (input) CHARACTER*1 +!> Determines whether the initial data snapshots are scaled +!> by a diagonal matrix. +!> 'S' :: The data snapshots matrices X and Y are multiplied +!> with a diagonal matrix D so that X*D has unit +!> nonzero columns (in the Euclidean 2-norm) +!> 'C' :: The snapshots are scaled as with the 'S' option. +!> If it is found that an i-th column of X is zero +!> vector and the corresponding i-th column of Y is +!> non-zero, then the i-th column of Y is set to +!> zero and a warning flag is raised. +!> 'Y' :: The data snapshots matrices X and Y are multiplied +!> by a diagonal matrix D so that Y*D has unit +!> nonzero columns (in the Euclidean 2-norm) +!> 'N' :: No data scaling. +!> \endverbatim !..... -! JOBZ (input) CHARACTER*1 -! Determines whether the eigenvectors (Koopman modes) will -! be computed. -! 'V' :: The eigenvectors (Koopman modes) will be computed -! and returned in the matrix Z. -! See the description of Z. -! 'F' :: The eigenvectors (Koopman modes) will be returned -! in factored form as the product X(:,1:K)*W, where X -! contains a POD basis (leading left singular vectors -! of the data matrix X) and W contains the eigenvectors -! of the corresponding Rayleigh quotient. -! See the descriptions of K, X, W, Z. -! 'N' :: The eigenvectors are not computed. +!> \param[in] JOBZ +!> \verbatim +!> JOBZ (input) CHARACTER*1 +!> Determines whether the eigenvectors (Koopman modes) will +!> be computed. +!> 'V' :: The eigenvectors (Koopman modes) will be computed +!> and returned in the matrix Z. +!> See the description of Z. +!> 'F' :: The eigenvectors (Koopman modes) will be returned +!> in factored form as the product X(:,1:K)*W, where X +!> contains a POD basis (leading left singular vectors +!> of the data matrix X) and W contains the eigenvectors +!> of the corresponding Rayleigh quotient. +!> See the descriptions of K, X, W, Z. +!> 'N' :: The eigenvectors are not computed. +!> \endverbatim !..... -! JOBR (input) CHARACTER*1 -! Determines whether to compute the residuals. -! 'R' :: The residuals for the computed eigenpairs will be -! computed and stored in the array RES. -! See the description of RES. -! For this option to be legal, JOBZ must be 'V'. -! 'N' :: The residuals are not computed. +!> \param[in] JOBR +!> \verbatim +!> JOBR (input) CHARACTER*1 +!> Determines whether to compute the residuals. +!> 'R' :: The residuals for the computed eigenpairs will be +!> computed and stored in the array RES. +!> See the description of RES. 
+!> For this option to be legal, JOBZ must be 'V'. +!> 'N' :: The residuals are not computed. +!> \endverbatim !..... -! JOBF (input) CHARACTER*1 -! Specifies whether to store information needed for post- -! processing (e.g. computing refined Ritz vectors) -! 'R' :: The matrix needed for the refinement of the Ritz -! vectors is computed and stored in the array B. -! See the description of B. -! 'E' :: The unscaled eigenvectors of the Exact DMD are -! computed and returned in the array B. See the -! description of B. -! 'N' :: No eigenvector refinement data is computed. +!> \param[in] JOBF +!> \verbatim +!> JOBF (input) CHARACTER*1 +!> Specifies whether to store information needed for post- +!> processing (e.g. computing refined Ritz vectors) +!> 'R' :: The matrix needed for the refinement of the Ritz +!> vectors is computed and stored in the array B. +!> See the description of B. +!> 'E' :: The unscaled eigenvectors of the Exact DMD are +!> computed and returned in the array B. See the +!> description of B. +!> 'N' :: No eigenvector refinement data is computed. +!> \endverbatim !..... -! WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } -! Allows for a selection of the SVD algorithm from the -! LAPACK library. -! 1 :: ZGESVD (the QR SVD algorithm) -! 2 :: ZGESDD (the Divide and Conquer algorithm; if enough -! workspace available, this is the fastest option) -! 3 :: ZGESVDQ (the preconditioned QR SVD ; this and 4 -! are the most accurate options) -! 4 :: ZGEJSV (the preconditioned Jacobi SVD; this and 3 -! are the most accurate options) -! For the four methods above, a significant difference in -! the accuracy of small singular values is possible if -! the snapshots vary in norm so that X is severely -! ill-conditioned. If small (smaller than EPS*||X||) -! singular values are of interest and JOBS=='N', then -! the options (3, 4) give the most accurate results, where -! the option 4 is slightly better and with stronger -! theoretical background. -! If JOBS=='S', i.e. the columns of X will be normalized, -! then all methods give nearly equally accurate results. +!> \param[in] WHTSVD +!> \verbatim +!> WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } +!> Allows for a selection of the SVD algorithm from the +!> LAPACK library. +!> 1 :: ZGESVD (the QR SVD algorithm) +!> 2 :: ZGESDD (the Divide and Conquer algorithm; if enough +!> workspace available, this is the fastest option) +!> 3 :: ZGESVDQ (the preconditioned QR SVD ; this and 4 +!> are the most accurate options) +!> 4 :: ZGEJSV (the preconditioned Jacobi SVD; this and 3 +!> are the most accurate options) +!> For the four methods above, a significant difference in +!> the accuracy of small singular values is possible if +!> the snapshots vary in norm so that X is severely +!> ill-conditioned. If small (smaller than EPS*||X||) +!> singular values are of interest and JOBS=='N', then +!> the options (3, 4) give the most accurate results, where +!> the option 4 is slightly better and with stronger +!> theoretical background. +!> If JOBS=='S', i.e. the columns of X will be normalized, +!> then all methods give nearly equally accurate results. +!> \endverbatim !..... -! M (input) INTEGER, M>= 0 -! The state space dimension (the row dimension of X, Y). +!> \param[in] M +!> \verbatim +!> M (input) INTEGER, M>= 0 +!> The state space dimension (the row dimension of X, Y). +!> \endverbatim !..... -! N (input) INTEGER, 0 <= N <= M -! The number of data snapshot pairs -! (the number of columns of X and Y). 
+!> \param[in] N +!> \verbatim +!> N (input) INTEGER, 0 <= N <= M +!> The number of data snapshot pairs +!> (the number of columns of X and Y). +!> \endverbatim !..... -! X (input/output) COMPLEX(KIND=WP) M-by-N array -! > On entry, X contains the data snapshot matrix X. It is -! assumed that the column norms of X are in the range of -! the normalized floating point numbers. -! < On exit, the leading K columns of X contain a POD basis, -! i.e. the leading K left singular vectors of the input -! data matrix X, U(:,1:K). All N columns of X contain all -! left singular vectors of the input matrix X. -! See the descriptions of K, Z and W. +!> \param[in] LDX +!> \verbatim +!> X (input/output) COMPLEX(KIND=WP) M-by-N array +!> > On entry, X contains the data snapshot matrix X. It is +!> assumed that the column norms of X are in the range of +!> the normalized floating point numbers. +!> < On exit, the leading K columns of X contain a POD basis, +!> i.e. the leading K left singular vectors of the input +!> data matrix X, U(:,1:K). All N columns of X contain all +!> left singular vectors of the input matrix X. +!> See the descriptions of K, Z and W. !..... -! LDX (input) INTEGER, LDX >= M -! The leading dimension of the array X. +!> LDX (input) INTEGER, LDX >= M +!> The leading dimension of the array X. +!> \endverbatim !..... -! Y (input/workspace/output) COMPLEX(KIND=WP) M-by-N array -! > On entry, Y contains the data snapshot matrix Y -! < On exit, -! If JOBR == 'R', the leading K columns of Y contain -! the residual vectors for the computed Ritz pairs. -! See the description of RES. -! If JOBR == 'N', Y contains the original input data, -! scaled according to the value of JOBS. +!> \param[in,out] Y +!> \verbatim +!> Y (input/workspace/output) COMPLEX(KIND=WP) M-by-N array +!> > On entry, Y contains the data snapshot matrix Y +!> < On exit, +!> If JOBR == 'R', the leading K columns of Y contain +!> the residual vectors for the computed Ritz pairs. +!> See the description of RES. +!> If JOBR == 'N', Y contains the original input data, +!> scaled according to the value of JOBS. +!> \endverbatim !..... -! LDY (input) INTEGER , LDY >= M -! The leading dimension of the array Y. +!> \param[in] LDY +!> \verbatim +!> LDY (input) INTEGER , LDY >= M +!> The leading dimension of the array Y. +!> \endverbatim !..... -! NRNK (input) INTEGER -! Determines the mode how to compute the numerical rank, -! i.e. how to truncate small singular values of the input -! matrix X. On input, if -! NRNK = -1 :: i-th singular value sigma(i) is truncated -! if sigma(i) <= TOL*sigma(1) -! This option is recommended. -! NRNK = -2 :: i-th singular value sigma(i) is truncated -! if sigma(i) <= TOL*sigma(i-1) -! This option is included for R&D purposes. -! It requires highly accurate SVD, which -! may not be feasible. -! The numerical rank can be enforced by using positive -! value of NRNK as follows: -! 0 < NRNK <= N :: at most NRNK largest singular values -! will be used. If the number of the computed nonzero -! singular values is less than NRNK, then only those -! nonzero values will be used and the actually used -! dimension is less than NRNK. The actual number of -! the nonzero singular values is returned in the variable -! K. See the descriptions of TOL and K. +!> \param[in] NRNK +!> \verbatim +!> NRNK (input) INTEGER +!> Determines the mode how to compute the numerical rank, +!> i.e. how to truncate small singular values of the input +!> matrix X. 
On input, if +!> NRNK = -1 :: i-th singular value sigma(i) is truncated +!> if sigma(i) <= TOL*sigma(1) +!> This option is recommended. +!> NRNK = -2 :: i-th singular value sigma(i) is truncated +!> if sigma(i) <= TOL*sigma(i-1) +!> This option is included for R&D purposes. +!> It requires highly accurate SVD, which +!> may not be feasible. +!> The numerical rank can be enforced by using positive +!> value of NRNK as follows: +!> 0 < NRNK <= N :: at most NRNK largest singular values +!> will be used. If the number of the computed nonzero +!> singular values is less than NRNK, then only those +!> nonzero values will be used and the actually used +!> dimension is less than NRNK. The actual number of +!> the nonzero singular values is returned in the variable +!> K. See the descriptions of TOL and K. +!> \endverbatim !..... -! TOL (input) REAL(KIND=WP), 0 <= TOL < 1 -! The tolerance for truncating small singular values. -! See the description of NRNK. +!> \param[in] TOL +!> \verbatim +!> TOL (input) REAL(KIND=WP), 0 <= TOL < 1 +!> The tolerance for truncating small singular values. +!> See the description of NRNK. +!> \endverbatim !..... -! K (output) INTEGER, 0 <= K <= N -! The dimension of the POD basis for the data snapshot -! matrix X and the number of the computed Ritz pairs. -! The value of K is determined according to the rule set -! by the parameters NRNK and TOL. -! See the descriptions of NRNK and TOL. +!> \param[out] K +!> \verbatim +!> K (output) INTEGER, 0 <= K <= N +!> The dimension of the POD basis for the data snapshot +!> matrix X and the number of the computed Ritz pairs. +!> The value of K is determined according to the rule set +!> by the parameters NRNK and TOL. +!> See the descriptions of NRNK and TOL. +!> \endverbatim !..... -! EIGS (output) COMPLEX(KIND=WP) N-by-1 array -! The leading K (K<=N) entries of EIGS contain -! the computed eigenvalues (Ritz values). -! See the descriptions of K, and Z. +!> \param[out] EIGS +!> \verbatim +!> EIGS (output) COMPLEX(KIND=WP) N-by-1 array +!> The leading K (K<=N) entries of EIGS contain +!> the computed eigenvalues (Ritz values). +!> See the descriptions of K, and Z. +!> \endverbatim !..... -! Z (workspace/output) COMPLEX(KIND=WP) M-by-N array -! If JOBZ =='V' then Z contains the Ritz vectors. Z(:,i) -! is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1. -! If JOBZ == 'F', then the Z(:,i)'s are given implicitly as -! the columns of X(:,1:K)*W(1:K,1:K), i.e. X(:,1:K)*W(:,i) -! is an eigenvector corresponding to EIGS(i). The columns -! of W(1:k,1:K) are the computed eigenvectors of the -! K-by-K Rayleigh quotient. -! See the descriptions of EIGS, X and W. +!> \param[out] Z +!> \verbatim +!> Z (workspace/output) COMPLEX(KIND=WP) M-by-N array +!> If JOBZ =='V' then Z contains the Ritz vectors. Z(:,i) +!> is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1. +!> If JOBZ == 'F', then the Z(:,i)'s are given implicitly as +!> the columns of X(:,1:K)*W(1:K,1:K), i.e. X(:,1:K)*W(:,i) +!> is an eigenvector corresponding to EIGS(i). The columns +!> of W(1:k,1:K) are the computed eigenvectors of the +!> K-by-K Rayleigh quotient. +!> See the descriptions of EIGS, X and W. +!> \endverbatim !..... -! LDZ (input) INTEGER , LDZ >= M -! The leading dimension of the array Z. +!> \param[in] LDZ +!> \verbatim +!> LDZ (input) INTEGER , LDZ >= M +!> The leading dimension of the array Z. +!> \endverbatim !..... -! RES (output) REAL(KIND=WP) N-by-1 array -! RES(1:K) contains the residuals for the K computed -! Ritz pairs, -! 
RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i))||_2. -! See the description of EIGS and Z. +!> \param[out] RES +!> \verbatim +!> RES (output) REAL(KIND=WP) N-by-1 array +!> RES(1:K) contains the residuals for the K computed +!> Ritz pairs, +!> RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i))||_2. +!> See the description of EIGS and Z. +!> \endverbatim !..... -! B (output) COMPLEX(KIND=WP) M-by-N array. -! IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can -! be used for computing the refined vectors; see further -! details in the provided references. -! If JOBF == 'E', B(1:M,1:K) contains -! A*U(:,1:K)*W(1:K,1:K), which are the vectors from the -! Exact DMD, up to scaling by the inverse eigenvalues. -! If JOBF =='N', then B is not referenced. -! See the descriptions of X, W, K. +!> \param[out] B +!> \verbatim +!> B (output) COMPLEX(KIND=WP) M-by-N array. +!> IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can +!> be used for computing the refined vectors; see further +!> details in the provided references. +!> If JOBF == 'E', B(1:M,1:K) contains +!> A*U(:,1:K)*W(1:K,1:K), which are the vectors from the +!> Exact DMD, up to scaling by the inverse eigenvalues. +!> If JOBF =='N', then B is not referenced. +!> See the descriptions of X, W, K. +!> \endverbatim !..... -! LDB (input) INTEGER, LDB >= M -! The leading dimension of the array B. +!> \param[in] LDB +!> \verbatim +!> LDB (input) INTEGER, LDB >= M +!> The leading dimension of the array B. +!> \endverbatim !..... -! W (workspace/output) COMPLEX(KIND=WP) N-by-N array -! On exit, W(1:K,1:K) contains the K computed -! eigenvectors of the matrix Rayleigh quotient. -! The Ritz vectors (returned in Z) are the -! product of X (containing a POD basis for the input -! matrix X) and W. See the descriptions of K, S, X and Z. -! W is also used as a workspace to temporarily store the -! right singular vectors of X. +!> \param[out] W +!> \verbatim +!> W (workspace/output) COMPLEX(KIND=WP) N-by-N array +!> On exit, W(1:K,1:K) contains the K computed +!> eigenvectors of the matrix Rayleigh quotient. +!> The Ritz vectors (returned in Z) are the +!> product of X (containing a POD basis for the input +!> matrix X) and W. See the descriptions of K, S, X and Z. +!> W is also used as a workspace to temporarily store the +!> right singular vectors of X. +!> \endverbatim !..... -! LDW (input) INTEGER, LDW >= N -! The leading dimension of the array W. +!> \param[in] LDW +!> \verbatim +!> LDW (input) INTEGER, LDW >= N +!> The leading dimension of the array W. +!> \endverbatim !..... -! S (workspace/output) COMPLEX(KIND=WP) N-by-N array -! The array S(1:K,1:K) is used for the matrix Rayleigh -! quotient. This content is overwritten during -! the eigenvalue decomposition by ZGEEV. -! See the description of K. +!> \param[out] S +!> \verbatim +!> S (workspace/output) COMPLEX(KIND=WP) N-by-N array +!> The array S(1:K,1:K) is used for the matrix Rayleigh +!> quotient. This content is overwritten during +!> the eigenvalue decomposition by ZGEEV. +!> See the description of K. +!> \endverbatim !..... -! LDS (input) INTEGER, LDS >= N -! The leading dimension of the array S. +!> \param[in] LDS +!> \verbatim +!> LDS (input) INTEGER, LDS >= N +!> The leading dimension of the array S. +!> \endverbatim !..... -! ZWORK (workspace/output) COMPLEX(KIND=WP) LZWORK-by-1 array -! ZWORK is used as complex workspace in the complex SVD, as -! specified by WHTSVD (1,2, 3 or 4) and for ZGEEV for computing -! the eigenvalues of a Rayleigh quotient. -! 
If the call to ZGEDMD is only workspace query, then -! ZWORK(1) contains the minimal complex workspace length and -! ZWORK(2) is the optimal complex workspace length. -! Hence, the length of work is at least 2. -! See the description of LZWORK. +!> \param[out] ZWORK +!> \verbatim +!> ZWORK (workspace/output) COMPLEX(KIND=WP) LZWORK-by-1 array +!> ZWORK is used as complex workspace in the complex SVD, as +!> specified by WHTSVD (1,2, 3 or 4) and for ZGEEV for computing +!> the eigenvalues of a Rayleigh quotient. +!> If the call to ZGEDMD is only workspace query, then +!> ZWORK(1) contains the minimal complex workspace length and +!> ZWORK(2) is the optimal complex workspace length. +!> Hence, the length of work is at least 2. +!> See the description of LZWORK. +!> \endverbatim !..... -! LZWORK (input) INTEGER -! The minimal length of the workspace vector ZWORK. -! LZWORK is calculated as MAX(LZWORK_SVD, LZWORK_ZGEEV), -! where LZWORK_ZGEEV = MAX( 1, 2*N ) and the minimal -! LZWORK_SVD is calculated as follows -! If WHTSVD == 1 :: ZGESVD :: -! LZWORK_SVD = MAX(1,2*MIN(M,N)+MAX(M,N)) -! If WHTSVD == 2 :: ZGESDD :: -! LZWORK_SVD = 2*MIN(M,N)*MIN(M,N)+2*MIN(M,N)+MAX(M,N) -! If WHTSVD == 3 :: ZGESVDQ :: -! LZWORK_SVD = obtainable by a query -! If WHTSVD == 4 :: ZGEJSV :: -! LZWORK_SVD = obtainable by a query -! If on entry LZWORK = -1, then a workspace query is -! assumed and the procedure only computes the minimal -! and the optimal workspace lengths and returns them in -! LZWORK(1) and LZWORK(2), respectively. +!> \param[in] LZWORK +!> \verbatim +!> LZWORK (input) INTEGER +!> The minimal length of the workspace vector ZWORK. +!> LZWORK is calculated as MAX(LZWORK_SVD, LZWORK_ZGEEV), +!> where LZWORK_ZGEEV = MAX( 1, 2*N ) and the minimal +!> LZWORK_SVD is calculated as follows +!> If WHTSVD == 1 :: ZGESVD :: +!> LZWORK_SVD = MAX(1,2*MIN(M,N)+MAX(M,N)) +!> If WHTSVD == 2 :: ZGESDD :: +!> LZWORK_SVD = 2*MIN(M,N)*MIN(M,N)+2*MIN(M,N)+MAX(M,N) +!> If WHTSVD == 3 :: ZGESVDQ :: +!> LZWORK_SVD = obtainable by a query +!> If WHTSVD == 4 :: ZGEJSV :: +!> LZWORK_SVD = obtainable by a query +!> If on entry LZWORK = -1, then a workspace query is +!> assumed and the procedure only computes the minimal +!> and the optimal workspace lengths and returns them in +!> LZWORK(1) and LZWORK(2), respectively. +!> \endverbatim !..... -! RWORK (workspace/output) REAL(KIND=WP) LRWORK-by-1 array -! On exit, RWORK(1:N) contains the singular values of -! X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). -! If WHTSVD==4, then RWORK(N+1) and RWORK(N+2) contain -! scaling factor RWORK(N+2)/RWORK(N+1) used to scale X -! and Y to avoid overflow in the SVD of X. -! This may be of interest if the scaling option is off -! and as many as possible smallest eigenvalues are -! desired to the highest feasible accuracy. -! If the call to ZGEDMD is only workspace query, then -! RWORK(1) contains the minimal workspace length. -! See the description of LRWORK. +!> \param[out] RWORK +!> \verbatim +!> RWORK (workspace/output) REAL(KIND=WP) LRWORK-by-1 array +!> On exit, RWORK(1:N) contains the singular values of +!> X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). +!> If WHTSVD==4, then RWORK(N+1) and RWORK(N+2) contain +!> scaling factor RWORK(N+2)/RWORK(N+1) used to scale X +!> and Y to avoid overflow in the SVD of X. +!> This may be of interest if the scaling option is off +!> and as many as possible smallest eigenvalues are +!> desired to the highest feasible accuracy. 
+!> If the call to ZGEDMD is only workspace query, then +!> RWORK(1) contains the minimal workspace length. +!> See the description of LRWORK. +!> \endverbatim !..... -! LRWORK (input) INTEGER -! The minimal length of the workspace vector RWORK. -! LRWORK is calculated as follows: -! LRWORK = MAX(1, N+LRWORK_SVD,N+LRWORK_ZGEEV), where -! LRWORK_ZGEEV = MAX(1,2*N) and RWORK_SVD is the real workspace -! for the SVD subroutine determined by the input parameter -! WHTSVD. -! If WHTSVD == 1 :: ZGESVD :: -! LRWORK_SVD = 5*MIN(M,N) -! If WHTSVD == 2 :: ZGESDD :: -! LRWORK_SVD = MAX(5*MIN(M,N)*MIN(M,N)+7*MIN(M,N), -! 2*MAX(M,N)*MIN(M,N)+2*MIN(M,N)*MIN(M,N)+MIN(M,N) ) ) -! If WHTSVD == 3 :: ZGESVDQ :: -! LRWORK_SVD = obtainable by a query -! If WHTSVD == 4 :: ZGEJSV :: -! LRWORK_SVD = obtainable by a query -! If on entry LRWORK = -1, then a workspace query is -! assumed and the procedure only computes the minimal -! real workspace length and returns it in RWORK(1). +!> \param[in] LRWORK +!> \verbatim +!> LRWORK (input) INTEGER +!> The minimal length of the workspace vector RWORK. +!> LRWORK is calculated as follows: +!> LRWORK = MAX(1, N+LRWORK_SVD,N+LRWORK_ZGEEV), where +!> LRWORK_ZGEEV = MAX(1,2*N) and RWORK_SVD is the real workspace +!> for the SVD subroutine determined by the input parameter +!> WHTSVD. +!> If WHTSVD == 1 :: ZGESVD :: +!> LRWORK_SVD = 5*MIN(M,N) +!> If WHTSVD == 2 :: ZGESDD :: +!> LRWORK_SVD = MAX(5*MIN(M,N)*MIN(M,N)+7*MIN(M,N), +!> 2*MAX(M,N)*MIN(M,N)+2*MIN(M,N)*MIN(M,N)+MIN(M,N) ) ) +!> If WHTSVD == 3 :: ZGESVDQ :: +!> LRWORK_SVD = obtainable by a query +!> If WHTSVD == 4 :: ZGEJSV :: +!> LRWORK_SVD = obtainable by a query +!> If on entry LRWORK = -1, then a workspace query is +!> assumed and the procedure only computes the minimal +!> real workspace length and returns it in RWORK(1). +!> \endverbatim !..... -! IWORK (workspace/output) INTEGER LIWORK-by-1 array -! Workspace that is required only if WHTSVD equals -! 2 , 3 or 4. (See the description of WHTSVD). -! If on entry LWORK =-1 or LIWORK=-1, then the -! minimal length of IWORK is computed and returned in -! IWORK(1). See the description of LIWORK. +!> \param[out] IWORK +!> \verbatim +!> IWORK (workspace/output) INTEGER LIWORK-by-1 array +!> Workspace that is required only if WHTSVD equals +!> 2 , 3 or 4. (See the description of WHTSVD). +!> If on entry LWORK =-1 or LIWORK=-1, then the +!> minimal length of IWORK is computed and returned in +!> IWORK(1). See the description of LIWORK. +!> \endverbatim !..... -! LIWORK (input) INTEGER -! The minimal length of the workspace vector IWORK. -! If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 -! If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) -! If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) -! If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) -! If on entry LIWORK = -1, then a workspace query is -! assumed and the procedure only computes the minimal -! and the optimal workspace lengths for ZWORK, RWORK and -! IWORK. See the descriptions of ZWORK, RWORK and IWORK. +!> \param[in] LIWORK +!> \verbatim +!> LIWORK (input) INTEGER +!> The minimal length of the workspace vector IWORK. +!> If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 +!> If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) +!> If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) +!> If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) +!> If on entry LIWORK = -1, then a workspace query is +!> assumed and the procedure only computes the minimal +!> and the optimal workspace lengths for ZWORK, RWORK and +!> IWORK. 
See the descriptions of ZWORK, RWORK and IWORK. +!> \endverbatim !..... -! INFO (output) INTEGER -! -i < 0 :: On entry, the i-th argument had an -! illegal value -! = 0 :: Successful return. -! = 1 :: Void input. Quick exit (M=0 or N=0). -! = 2 :: The SVD computation of X did not converge. -! Suggestion: Check the input data and/or -! repeat with different WHTSVD. -! = 3 :: The computation of the eigenvalues did not -! converge. -! = 4 :: If data scaling was requested on input and -! the procedure found inconsistency in the data -! such that for some column index i, -! X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set -! to zero if JOBS=='C'. The computation proceeds -! with original or modified data and warning -! flag is set with INFO=4. +!> \param[out] INFO +!> \verbatim +!> INFO (output) INTEGER +!> -i < 0 :: On entry, the i-th argument had an +!> illegal value +!> = 0 :: Successful return. +!> = 1 :: Void input. Quick exit (M=0 or N=0). +!> = 2 :: The SVD computation of X did not converge. +!> Suggestion: Check the input data and/or +!> repeat with different WHTSVD. +!> = 3 :: The computation of the eigenvalues did not +!> converge. +!> = 4 :: If data scaling was requested on input and +!> the procedure found inconsistency in the data +!> such that for some column index i, +!> X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set +!> to zero if JOBS=='C'. The computation proceeds +!> with original or modified data and warning +!> flag is set with INFO=4. +!> \endverbatim +! +! Authors: +! ======== +! +!> \author Zlatko Drmac +! +!> \ingroup gedmd +! !............................................................. !............................................................. + SUBROUTINE ZGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & + M, N, X, LDX, Y, LDY, NRNK, TOL, & + K, EIGS, Z, LDZ, RES, B, LDB, & + W, LDW, S, LDS, ZWORK, LZWORK, & + RWORK, LRWORK, IWORK, LIWORK, INFO ) +! +! -- LAPACK driver routine -- +! +! -- LAPACK is a software package provided by University of -- +! -- Tennessee, University of California Berkeley, University of -- +! -- Colorado Denver and NAG Ltd.. -- +! +!..... + USE iso_fortran_env + IMPLICIT NONE + INTEGER, PARAMETER :: WP = real64 +! +! Scalar arguments +! ~~~~~~~~~~~~~~~~ + CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF + INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & + NRNK, LDZ, LDB, LDW, LDS, & + LIWORK, LRWORK, LZWORK + INTEGER, INTENT(OUT) :: K, INFO + REAL(KIND=WP), INTENT(IN) :: TOL +! +! Array arguments +! ~~~~~~~~~~~~~~~ + COMPLEX(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) + COMPLEX(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & + W(LDW,*), S(LDS,*) + COMPLEX(KIND=WP), INTENT(OUT) :: EIGS(*) + COMPLEX(KIND=WP), INTENT(OUT) :: ZWORK(*) + REAL(KIND=WP), INTENT(OUT) :: RES(*) + REAL(KIND=WP), INTENT(OUT) :: RWORK(*) + INTEGER, INTENT(OUT) :: IWORK(*) +! ! Parameters ! ~~~~~~~~~~ REAL(KIND=WP), PARAMETER :: ONE = 1.0_WP REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP COMPLEX(KIND=WP), PARAMETER :: ZONE = ( 1.0_WP, 0.0_WP ) COMPLEX(KIND=WP), PARAMETER :: ZZERO = ( 0.0_WP, 0.0_WP ) - +! ! Local scalars ! ~~~~~~~~~~~~~ REAL(KIND=WP) :: OFL, ROOTSC, SCALE, SMALL, & @@ -401,7 +551,7 @@ ! Local arrays ! ~~~~~~~~~~~~ REAL(KIND=WP) :: RDUMMY(2) - +! ! External functions (BLAS and LAPACK) ! ~~~~~~~~~~~~~~~~~ REAL(KIND=WP) ZLANGE, DLAMCH, DZNRM2 @@ -409,13 +559,13 @@ INTEGER IZAMAX LOGICAL DISNAN, LSAME EXTERNAL DISNAN, LSAME - +! ! External subroutines (BLAS and LAPACK) ! 
~~~~~~~~~~~~~~~~~~~~ EXTERNAL ZAXPY, ZGEMM, ZDSCAL EXTERNAL ZGEEV, ZGEJSV, ZGESDD, ZGESVD, ZGESVDQ, & ZLACPY, ZLASCL, ZLASSQ, XERBLA - +! ! Intrinsic functions ! ~~~~~~~~~~~~~~~~~~~ INTRINSIC DBLE, INT, MAX, SQRT @@ -608,7 +758,8 @@ K = 0 DO i = 1, N !WORK(i) = DZNRM2( M, X(1,i), 1 ) - SCALE = ZERO + SSUM = ONE + SCALE = ZERO CALL ZLASSQ( M, X(1,i), 1, SCALE, SSUM ) IF ( DISNAN(SCALE) .OR. DISNAN(SSUM) ) THEN K = 0 @@ -681,7 +832,8 @@ ! carefully computed using ZLASSQ. DO i = 1, N !RWORK(i) = DZNRM2( M, Y(1,i), 1 ) - SCALE = ZERO + SSUM = ONE + SCALE = ZERO CALL ZLASSQ( M, Y(1,i), 1, SCALE, SSUM ) IF ( DISNAN(SCALE) .OR. DISNAN(SSUM) ) THEN K = 0 diff --git a/lapack-netlib/SRC/zgehrd.f b/lapack-netlib/SRC/zgehrd.f index e18493cf9..0f4424ded 100644 --- a/lapack-netlib/SRC/zgehrd.f +++ b/lapack-netlib/SRC/zgehrd.f @@ -89,7 +89,7 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX*16 array, dimension (LWORK) +*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> @@ -120,7 +120,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEcomputational +*> \ingroup gehrd * *> \par Further Details: * ===================== @@ -173,7 +173,7 @@ INTEGER IHI, ILO, INFO, LDA, LWORK, N * .. * .. Array Arguments .. - COMPLEX*16 A( LDA, * ), TAU( * ), WORK( * ) + COMPLEX*16 A( LDA, * ), TAU( * ), WORK( * ) * .. * * ===================================================================== @@ -182,7 +182,7 @@ INTEGER NBMAX, LDT, TSIZE PARAMETER ( NBMAX = 64, LDT = NBMAX+1, $ TSIZE = LDT*NBMAX ) - COMPLEX*16 ZERO, ONE + COMPLEX*16 ZERO, ONE PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ), $ ONE = ( 1.0D+0, 0.0D+0 ) ) * .. @@ -190,7 +190,7 @@ LOGICAL LQUERY INTEGER I, IB, IINFO, IWT, J, LDWORK, LWKOPT, NB, $ NBMIN, NH, NX - COMPLEX*16 EI + COMPLEX*16 EI * .. * .. External Subroutines .. EXTERNAL ZAXPY, ZGEHD2, ZGEMM, ZLAHR2, ZLARFB, ZTRMM, @@ -221,12 +221,18 @@ INFO = -8 END IF * + NH = IHI - ILO + 1 IF( INFO.EQ.0 ) THEN * * Compute the workspace requirements * - NB = MIN( NBMAX, ILAENV( 1, 'ZGEHRD', ' ', N, ILO, IHI, -1 ) ) - LWKOPT = N*NB + TSIZE + IF( NH.LE.1 ) THEN + LWKOPT = 1 + ELSE + NB = MIN( NBMAX, ILAENV( 1, 'ZGEHRD', ' ', N, ILO, IHI, + $ -1 ) ) + LWKOPT = N*NB + TSIZE + END IF WORK( 1 ) = LWKOPT ENDIF * @@ -248,7 +254,6 @@ * * Quick return if possible * - NH = IHI - ILO + 1 IF( NH.LE.1 ) THEN WORK( 1 ) = 1 RETURN @@ -268,7 +273,7 @@ * * Determine if workspace is large enough for blocked code * - IF( LWORK.LT.N*NB+TSIZE ) THEN + IF( LWORK.LT.LWKOPT ) THEN * * Not enough workspace to use optimal NB: determine the * minimum value of NB, and reduce NB or force use of diff --git a/lapack-netlib/SRC/zgelq.f b/lapack-netlib/SRC/zgelq.f index de7c9a378..86610e801 100644 --- a/lapack-netlib/SRC/zgelq.f +++ b/lapack-netlib/SRC/zgelq.f @@ -98,7 +98,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. The routine *> only calculates the sizes of the T and WORK arrays, returns these *> values as the first entries of the T and WORK arrays, and no error @@ -166,6 +166,8 @@ *> the LQ factorization. 
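ZGELQ and ZGEMLQ (next hunks) form a factor-and-apply pair whose T and WORK sizes must each be obtained by a workspace query (TSIZE = -1 / LWORK = -1), and whose minimal LWORK is now documented as at least 1. A hedged sketch for a wide matrix (M <= N assumed), applying Q**H from the left to an N-by-NRHS matrix C; the wrapper name and sizes are assumptions, only the documented ZGELQ/ZGEMLQ interfaces are relied on.

      ! Illustrative sketch (not part of the patch): LQ factor with ZGELQ,
      ! then apply Q**H with ZGEMLQ, querying each workspace separately.
      subroutine example_zgelq_zgemlq( m, n, nrhs, a, lda, c, ldc, info )
         implicit none
         integer, intent(in)       :: m, n, nrhs, lda, ldc
         integer, intent(out)      :: info
         complex*16, intent(inout) :: a( lda, * ), c( ldc, * )
         complex*16                :: tq( 5 ), wq( 1 )
         complex*16, allocatable   :: t( : ), work( : )
         integer                   :: tsize, lwork

         ! TSIZE = -1 and LWORK = -1 return the required sizes as the
         ! first entries of T and WORK.
         call zgelq( m, n, a, lda, tq, -1, wq, -1, info )
         tsize = int( tq( 1 ) )
         lwork = int( wq( 1 ) )
         allocate( t( max( 5, tsize ) ), work( max( 1, lwork ) ) )
         call zgelq( m, n, a, lda, t, tsize, work, lwork, info )

         ! Apply Q**H to C (C has N rows; Q is built from K = M reflectors).
         ! ZGEMLQ has its own optimal LWORK, so query it separately.
         call zgemlq( 'L', 'C', n, nrhs, m, a, lda, t, tsize, c, ldc,     &
                      wq, -1, info )
         lwork = int( wq( 1 ) )
         deallocate( work )
         allocate( work( max( 1, lwork ) ) )
         call zgemlq( 'L', 'C', n, nrhs, m, a, lda, t, tsize, c, ldc,     &
                      work, lwork, info )
      end subroutine example_zgelq_zgemlq
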
*> \endverbatim *> +*> \ingroup gelq +*> * ===================================================================== SUBROUTINE ZGELQ( M, N, A, LDA, T, TSIZE, WORK, LWORK, $ INFO ) diff --git a/lapack-netlib/SRC/zgelqf.f b/lapack-netlib/SRC/zgelqf.f index 6c295eece..e988ea818 100644 --- a/lapack-netlib/SRC/zgelqf.f +++ b/lapack-netlib/SRC/zgelqf.f @@ -93,7 +93,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,M). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= M, otherwise. *> For optimum performance LWORK >= M*NB, where NB is the *> optimal blocksize. *> @@ -118,7 +119,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEcomputational +*> \ingroup gelqf * *> \par Further Details: * ===================== @@ -174,9 +175,8 @@ * Test the input arguments * INFO = 0 + K = MIN( M, N ) NB = ILAENV( 1, 'ZGELQF', ' ', M, N, -1, -1 ) - LWKOPT = M*NB - WORK( 1 ) = LWKOPT LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -184,19 +184,25 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, M ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + ELSE IF( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( N.GT.0 .AND. LWORK.LT.MAX( 1, M ) ) ) + $ INFO = -7 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZGELQF', -INFO ) RETURN ELSE IF( LQUERY ) THEN + IF( K.EQ.0 ) THEN + LWKOPT = 1 + ELSE + LWKOPT = M*NB + END IF + WORK( 1 ) = LWKOPT RETURN END IF * * Quick return if possible * - K = MIN( M, N ) IF( K.EQ.0 ) THEN WORK( 1 ) = 1 RETURN diff --git a/lapack-netlib/SRC/zgemlq.f b/lapack-netlib/SRC/zgemlq.f index 41cd1c059..11489087a 100644 --- a/lapack-netlib/SRC/zgemlq.f +++ b/lapack-netlib/SRC/zgemlq.f @@ -109,16 +109,17 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1, then a workspace query is assumed. The routine *> only calculates the size of the WORK array, returns this -*> value as WORK(1), and no error message related to WORK +*> value as WORK(1), and no error message related to WORK *> is issued by XERBLA. *> \endverbatim *> @@ -142,7 +143,7 @@ *> *> \verbatim *> -*> These details are particular for this LAPACK implementation. Users should not +*> These details are particular for this LAPACK implementation. Users should not *> take them for granted. These details may change in the future, and are not likely *> true for another LAPACK implementation. These details are relevant if one wants *> to try to understand the code. They are not part of the interface. @@ -158,11 +159,13 @@ *> block sizes MB and NB returned by ILAENV, ZGELQ will use either *> ZLASWLQ (if the matrix is wide-and-short) or ZGELQT to compute *> the LQ factorization. -*> This version of ZGEMLQ will use either ZLAMSWLQ or ZGEMLQT to +*> This version of ZGEMLQ will use either ZLAMSWLQ or ZGEMLQT to *> multiply matrix Q by another matrix. *> Further Details in ZLAMSWLQ or ZGEMLQT. *> \endverbatim *> +*> \ingroup gemlq +*> * ===================================================================== SUBROUTINE ZGEMLQ( SIDE, TRANS, M, N, K, A, LDA, T, TSIZE, $ C, LDC, WORK, LWORK, INFO ) @@ -184,7 +187,7 @@ * .. * .. Local Scalars .. 
LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER MB, NB, LW, NBLCKS, MN + INTEGER MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME @@ -200,7 +203,7 @@ * * Test the input arguments * - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'C' ) LEFT = LSAME( SIDE, 'L' ) @@ -215,6 +218,13 @@ LW = M * MB MN = N END IF +* + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF * IF( ( NB.GT.K ) .AND. ( MN.GT.K ) ) THEN IF( MOD( MN - K, NB - K ) .EQ. 0 ) THEN @@ -243,7 +253,7 @@ INFO = -9 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN INFO = -11 - ELSE IF( ( LWORK.LT.MAX( 1, LW ) ) .AND. ( .NOT.LQUERY ) ) THEN + ELSE IF( ( LWORK.LT.LWMIN ) .AND. ( .NOT.LQUERY ) ) THEN INFO = -13 END IF * @@ -260,7 +270,7 @@ * * Quick return if possible * - IF( MIN( M, N, K ).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * diff --git a/lapack-netlib/SRC/zgemqr.f b/lapack-netlib/SRC/zgemqr.f index c83eaff2f..d14d74fe2 100644 --- a/lapack-netlib/SRC/zgemqr.f +++ b/lapack-netlib/SRC/zgemqr.f @@ -111,16 +111,17 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1, then a workspace query is assumed. The routine *> only calculates the size of the WORK array, returns this -*> value as WORK(1), and no error message related to WORK +*> value as WORK(1), and no error message related to WORK *> is issued by XERBLA. *> \endverbatim *> @@ -144,7 +145,7 @@ *> *> \verbatim *> -*> These details are particular for this LAPACK implementation. Users should not +*> These details are particular for this LAPACK implementation. Users should not *> take them for granted. These details may change in the future, and are not likely *> true for another LAPACK implementation. These details are relevant if one wants *> to try to understand the code. They are not part of the interface. @@ -166,6 +167,8 @@ *> *> \endverbatim *> +*> \ingroup gemqr +*> * ===================================================================== SUBROUTINE ZGEMQR( SIDE, TRANS, M, N, K, A, LDA, T, TSIZE, $ C, LDC, WORK, LWORK, INFO ) @@ -187,7 +190,7 @@ * .. * .. Local Scalars .. LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER MB, NB, LW, NBLCKS, MN + INTEGER MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME @@ -203,7 +206,7 @@ * * Test the input arguments * - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'C' ) LEFT = LSAME( SIDE, 'L' ) @@ -218,6 +221,13 @@ LW = MB * NB MN = N END IF +* + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF * IF( ( MB.GT.K ) .AND. ( MN.GT.K ) ) THEN IF( MOD( MN - K, MB - K ).EQ.0 ) THEN @@ -246,12 +256,12 @@ INFO = -9 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN INFO = -11 - ELSE IF( ( LWORK.LT.MAX( 1, LW ) ) .AND. ( .NOT.LQUERY ) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. 
.NOT.LQUERY ) THEN INFO = -13 END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = LW + WORK( 1 ) = LWMIN END IF * IF( INFO.NE.0 ) THEN @@ -263,7 +273,7 @@ * * Quick return if possible * - IF( MIN( M, N, K ).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * @@ -276,7 +286,7 @@ $ NB, C, LDC, WORK, LWORK, INFO ) END IF * - WORK( 1 ) = LW + WORK( 1 ) = LWMIN * RETURN * diff --git a/lapack-netlib/SRC/zgeqlf.f b/lapack-netlib/SRC/zgeqlf.f index 94721540c..a27612c64 100644 --- a/lapack-netlib/SRC/zgeqlf.f +++ b/lapack-netlib/SRC/zgeqlf.f @@ -88,7 +88,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,N). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise. *> For optimum performance LWORK >= N*NB, where NB is *> the optimal blocksize. *> @@ -113,7 +114,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEcomputational +*> \ingroup geqlf * *> \par Further Details: * ===================== @@ -188,8 +189,9 @@ END IF WORK( 1 ) = LWKOPT * - IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + IF( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( M.GT.0 .AND. LWORK.LT.MAX( 1, N ) ) ) + $ INFO = -7 END IF END IF * diff --git a/lapack-netlib/SRC/zgeqp3rk.f b/lapack-netlib/SRC/zgeqp3rk.f index 247a3c379..01dcce0de 100644 --- a/lapack-netlib/SRC/zgeqp3rk.f +++ b/lapack-netlib/SRC/zgeqp3rk.f @@ -428,7 +428,8 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*. LWORK >= N+NRHS-1 +*> LWORK >= 1, if MIN(M,N) = 0, and +*> LWORK >= N+NRHS-1, otherwise. *> For optimal performance LWORK >= NB*( N+NRHS+1 ), *> where NB is the optimal block size for ZGEQP3RK returned *> by ILAENV. Minimal block size MINNB=2. diff --git a/lapack-netlib/SRC/zgeqr.f b/lapack-netlib/SRC/zgeqr.f index 20a80d083..7df9c2403 100644 --- a/lapack-netlib/SRC/zgeqr.f +++ b/lapack-netlib/SRC/zgeqr.f @@ -99,7 +99,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. The routine *> only calculates the sizes of the T and WORK arrays, returns these *> values as the first entries of the T and WORK arrays, and no error @@ -168,6 +168,8 @@ *> *> \endverbatim *> +*> \ingroup geqr +*> * ===================================================================== SUBROUTINE ZGEQR( M, N, A, LDA, T, TSIZE, WORK, LWORK, $ INFO ) @@ -188,7 +190,7 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, LMINWS, MINT, MINW - INTEGER MB, NB, MINTSZ, NBLCKS + INTEGER MB, NB, MINTSZ, NBLCKS, LWMIN, LWREQ * .. * .. External Functions .. LOGICAL LSAME @@ -244,8 +246,10 @@ * * Determine if the workspace size satisfies minimal size * + LWMIN = MAX( 1, N ) + LWREQ = MAX( 1, N*NB ) LMINWS = .FALSE. - IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.NB*N ) + IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.LWREQ ) $ .AND. ( LWORK.GE.N ) .AND. ( TSIZE.GE.MINTSZ ) $ .AND. ( .NOT.LQUERY ) ) THEN IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) ) THEN @@ -253,7 +257,7 @@ NB = 1 MB = M END IF - IF( LWORK.LT.NB*N ) THEN + IF( LWORK.LT.LWREQ ) THEN LMINWS = .TRUE. NB = 1 END IF @@ -268,7 +272,7 @@ ELSE IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) $ .AND. ( .NOT.LQUERY ) .AND. ( .NOT.LMINWS ) ) THEN INFO = -6 - ELSE IF( ( LWORK.LT.MAX( 1, N*NB ) ) .AND. ( .NOT.LQUERY ) + ELSE IF( ( LWORK.LT.LWREQ ) .AND. ( .NOT.LQUERY ) $ .AND. 
( .NOT.LMINWS ) ) THEN INFO = -8 END IF @@ -282,9 +286,9 @@ T( 2 ) = MB T( 3 ) = NB IF( MINW ) THEN - WORK( 1 ) = MAX( 1, N ) + WORK( 1 ) = LWMIN ELSE - WORK( 1 ) = MAX( 1, NB*N ) + WORK( 1 ) = LWREQ END IF END IF IF( INFO.NE.0 ) THEN @@ -309,7 +313,7 @@ $ LWORK, INFO ) END IF * - WORK( 1 ) = MAX( 1, NB*N ) + WORK( 1 ) = LWREQ * RETURN * diff --git a/lapack-netlib/SRC/zgeqrfp.f b/lapack-netlib/SRC/zgeqrfp.f index 73bcde667..3562de36e 100644 --- a/lapack-netlib/SRC/zgeqrfp.f +++ b/lapack-netlib/SRC/zgeqrfp.f @@ -97,7 +97,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,N). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise. *> For optimum performance LWORK >= N*NB, where NB is *> the optimal blocksize. *> @@ -122,7 +123,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEcomputational +*> \ingroup geqrfp * *> \par Further Details: * ===================== @@ -162,8 +163,8 @@ * * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, IINFO, IWS, K, LDWORK, LWKOPT, NB, - $ NBMIN, NX + INTEGER I, IB, IINFO, IWS, K, LDWORK, LWKMIN, LWKOPT, + $ NB, NBMIN, NX * .. * .. External Subroutines .. EXTERNAL XERBLA, ZGEQR2P, ZLARFB, ZLARFT @@ -181,8 +182,16 @@ * INFO = 0 NB = ILAENV( 1, 'ZGEQRF', ' ', M, N, -1, -1 ) - LWKOPT = N*NB + K = MIN( M, N ) + IF( K.EQ.0 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = N + LWKOPT = N*NB + END IF WORK( 1 ) = LWKOPT +* LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -190,7 +199,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -7 END IF IF( INFO.NE.0 ) THEN @@ -202,7 +211,6 @@ * * Quick return if possible * - K = MIN( M, N ) IF( K.EQ.0 ) THEN WORK( 1 ) = 1 RETURN @@ -210,7 +218,7 @@ * NBMIN = 2 NX = 0 - IWS = N + IWS = LWKMIN IF( NB.GT.1 .AND. NB.LT.K ) THEN * * Determine when to cross over from blocked to unblocked code. diff --git a/lapack-netlib/SRC/zgesvj.f b/lapack-netlib/SRC/zgesvj.f index 6cd2335f2..2be45d826 100644 --- a/lapack-netlib/SRC/zgesvj.f +++ b/lapack-netlib/SRC/zgesvj.f @@ -200,23 +200,25 @@ *> \verbatim *> LDV is INTEGER *> The leading dimension of the array V, LDV >= 1. -*> If JOBV = 'V', then LDV >= max(1,N). -*> If JOBV = 'A', then LDV >= max(1,MV) . +*> If JOBV = 'V', then LDV >= MAX(1,N). +*> If JOBV = 'A', then LDV >= MAX(1,MV) . *> \endverbatim *> *> \param[in,out] CWORK *> \verbatim -*> CWORK is COMPLEX*16 array, dimension (max(1,LWORK)) +*> CWORK is COMPLEX*16 array, dimension (MAX(1,LWORK)) *> Used as workspace. -*> If on entry LWORK = -1, then a workspace query is assumed and -*> no computation is done; CWORK(1) is set to the minial (and optimal) -*> length of CWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER. -*> Length of CWORK, LWORK >= M+N. +*> Length of CWORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= M+N, otherwise. +*> +*> If on entry LWORK = -1, then a workspace query is assumed and +*> no computation is done; CWORK(1) is set to the minial (and optimal) +*> length of CWORK. *> \endverbatim *> *> \param[in,out] RWORK @@ -247,15 +249,17 @@ *> RWORK(6) = the largest absolute value over all sines of the *> Jacobi rotation angles in the last sweep. It can be *> useful for a post festum analysis. 
-*> If on entry LRWORK = -1, then a workspace query is assumed and -*> no computation is done; RWORK(1) is set to the minial (and optimal) -*> length of RWORK. *> \endverbatim *> *> \param[in] LRWORK *> \verbatim *> LRWORK is INTEGER -*> Length of RWORK, LRWORK >= MAX(6,N). +*> Length of RWORK. +*> LRWORK >= 1, if MIN(M,N) = 0, and LRWORK >= MAX(6,N), otherwise. +*> +*> If on entry LRWORK = -1, then a workspace query is assumed and +*> no computation is done; RWORK(1) is set to the minial (and optimal) +*> length of RWORK. *> \endverbatim *> *> \param[out] INFO @@ -276,7 +280,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEcomputational +*> \ingroup gesvj * *> \par Further Details: * ===================== @@ -367,23 +371,25 @@ * * .. Local Parameters .. DOUBLE PRECISION ZERO, HALF, ONE - PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0) - COMPLEX*16 CZERO, CONE - PARAMETER ( CZERO = (0.0D0, 0.0D0), CONE = (1.0D0, 0.0D0) ) - INTEGER NSWEEP - PARAMETER ( NSWEEP = 30 ) + PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0) + COMPLEX*16 CZERO, CONE + PARAMETER ( CZERO = (0.0D0, 0.0D0), CONE = (1.0D0, 0.0D0) ) + INTEGER NSWEEP + PARAMETER ( NSWEEP = 30 ) * .. * .. Local Scalars .. - COMPLEX*16 AAPQ, OMPQ - DOUBLE PRECISION AAPP, AAPP0, AAPQ1, AAQQ, APOAQ, AQOAP, BIG, - $ BIGTHETA, CS, CTOL, EPSLN, MXAAPQ, - $ MXSINJ, ROOTBIG, ROOTEPS, ROOTSFMIN, ROOTTOL, - $ SKL, SFMIN, SMALL, SN, T, TEMP1, THETA, THSIGN, TOL - INTEGER BLSKIP, EMPTSW, i, ibr, IERR, igl, IJBLSK, ir1, - $ ISWROT, jbc, jgl, KBL, LKAHEAD, MVL, N2, N34, - $ N4, NBL, NOTROT, p, PSKIPPED, q, ROWSKIP, SWBAND - LOGICAL APPLV, GOSCALE, LOWER, LQUERY, LSVEC, NOSCALE, ROTOK, - $ RSVEC, UCTOL, UPPER + COMPLEX*16 AAPQ, OMPQ + DOUBLE PRECISION AAPP, AAPP0, AAPQ1, AAQQ, APOAQ, AQOAP, BIG, + $ BIGTHETA, CS, CTOL, EPSLN, MXAAPQ, + $ MXSINJ, ROOTBIG, ROOTEPS, ROOTSFMIN, ROOTTOL, + $ SKL, SFMIN, SMALL, SN, T, TEMP1, THETA, THSIGN, + $ TOL + INTEGER BLSKIP, EMPTSW, i, ibr, IERR, igl, IJBLSK, ir1, + $ ISWROT, jbc, jgl, KBL, LKAHEAD, MVL, N2, N34, + $ N4, NBL, NOTROT, p, PSKIPPED, q, ROWSKIP, + $ SWBAND, MINMN, LWMIN, LRWMIN + LOGICAL APPLV, GOSCALE, LOWER, LQUERY, LSVEC, NOSCALE, + $ ROTOK, RSVEC, UCTOL, UPPER * .. * .. * .. Intrinsic Functions .. @@ -422,7 +428,16 @@ UPPER = LSAME( JOBA, 'U' ) LOWER = LSAME( JOBA, 'L' ) * - LQUERY = ( LWORK .EQ. -1 ) .OR. ( LRWORK .EQ. -1 ) + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + LRWMIN = 1 + ELSE + LWMIN = M+N + LRWMIN = MAX( 6, N ) + END IF +* + LQUERY = ( LWORK.EQ.-1 ) .OR. ( LRWORK.EQ.-1 ) IF( .NOT.( UPPER .OR. LOWER .OR. LSAME( JOBA, 'G' ) ) ) THEN INFO = -1 ELSE IF( .NOT.( LSVEC .OR. UCTOL .OR. LSAME( JOBU, 'N' ) ) ) THEN @@ -442,9 +457,9 @@ INFO = -11 ELSE IF( UCTOL .AND. ( RWORK( 1 ).LE.ONE ) ) THEN INFO = -12 - ELSE IF( ( LWORK.LT.( M+N ) ) .AND. ( .NOT.LQUERY ) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. ( .NOT.LQUERY ) ) THEN INFO = -13 - ELSE IF( ( LRWORK.LT.MAX( N, 6 ) ) .AND. ( .NOT.LQUERY ) ) THEN + ELSE IF( LRWORK.LT.LRWMIN .AND. ( .NOT.LQUERY ) ) THEN INFO = -15 ELSE INFO = 0 @@ -454,15 +469,15 @@ IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZGESVJ', -INFO ) RETURN - ELSE IF ( LQUERY ) THEN - CWORK(1) = M + N - RWORK(1) = MAX( N, 6 ) + ELSE IF( LQUERY ) THEN + CWORK( 1 ) = LWMIN + RWORK( 1 ) = LRWMIN RETURN END IF * * #:) Quick return for void matrix * - IF( ( M.EQ.0 ) .OR. 
( N.EQ.0 ) )RETURN + IF( MINMN.EQ.0 ) RETURN * * Set numerical parameters * The stopping criterion for Jacobi rotations is diff --git a/lapack-netlib/SRC/zgetri.f b/lapack-netlib/SRC/zgetri.f index 41782841c..f3806a77c 100644 --- a/lapack-netlib/SRC/zgetri.f +++ b/lapack-netlib/SRC/zgetri.f @@ -107,7 +107,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEcomputational +*> \ingroup getri * * ===================================================================== SUBROUTINE ZGETRI( N, A, LDA, IPIV, WORK, LWORK, INFO ) @@ -152,7 +152,7 @@ * INFO = 0 NB = ILAENV( 1, 'ZGETRI', ' ', N, -1, -1, -1 ) - LWKOPT = N*NB + LWKOPT = MAX( 1, N*NB ) WORK( 1 ) = LWKOPT LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.0 ) THEN diff --git a/lapack-netlib/SRC/zgetsls.f b/lapack-netlib/SRC/zgetsls.f index 17c6d5146..26311c611 100644 --- a/lapack-netlib/SRC/zgetsls.f +++ b/lapack-netlib/SRC/zgetsls.f @@ -127,7 +127,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. *> If LWORK = -1, the routine calculates optimal size of WORK for the *> optimal performance and returns this value in WORK(1). @@ -154,7 +154,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEsolve +*> \ingroup getsls * * ===================================================================== SUBROUTINE ZGETSLS( TRANS, M, N, NRHS, A, LDA, B, LDB, @@ -192,7 +192,7 @@ * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION DLAMCH, ZLANGE - EXTERNAL LSAME, DLABAD, DLAMCH, ZLANGE + EXTERNAL LSAME, DLAMCH, ZLANGE * .. * .. External Subroutines .. EXTERNAL ZGEQR, ZGEMQR, ZLASCL, ZLASET, @@ -229,7 +229,10 @@ * * Determine the optimum and minimum LWORK * - IF( M.GE.N ) THEN + IF( MIN( M, N, NRHS ).EQ.0 ) THEN + WSIZEO = 1 + WSIZEM = 1 + ELSE IF( M.GE.N ) THEN CALL ZGEQR( M, N, A, LDA, TQ, -1, WORKQ, -1, INFO2 ) TSZO = INT( TQ( 1 ) ) LWO = INT( WORKQ( 1 ) ) @@ -297,7 +300,6 @@ * SMLNUM = DLAMCH( 'S' ) / DLAMCH( 'P' ) BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) * * Scale A, B if max element outside range [SMLNUM,BIGNUM] * diff --git a/lapack-netlib/SRC/zgetsqrhrt.f b/lapack-netlib/SRC/zgetsqrhrt.f index 5f0167937..e7ce993aa 100644 --- a/lapack-netlib/SRC/zgetsqrhrt.f +++ b/lapack-netlib/SRC/zgetsqrhrt.f @@ -131,13 +131,15 @@ *> \param[in] LWORK *> \verbatim *> The dimension of the array WORK. -*> LWORK >= MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ), +*> If MIN(M,N) = 0, LWORK >= 1, else +*> LWORK >= MAX( 1, LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ), *> where *> NUM_ALL_ROW_BLOCKS = CEIL((M-N)/(MB1-N)), *> NB1LOCAL = MIN(NB1,N). *> LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL, *> LW1 = NB1LOCAL * N, -*> LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ), +*> LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ). +*> *> If LWORK = -1, then a workspace query is assumed. *> The routine only calculates the optimal size of the WORK *> array, returns this value as the first entry of the WORK @@ -160,7 +162,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup comlpex16OTHERcomputational +*> \ingroup getsqrhrt * *> \par Contributors: * ================== @@ -212,7 +214,7 @@ * Test the input arguments * INFO = 0 - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 .OR. 
M.LT.N ) THEN @@ -225,7 +227,7 @@ INFO = -5 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -7 - ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN + ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN INFO = -9 ELSE * @@ -263,8 +265,9 @@ LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ) * LWORKOPT = MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ) + LWORKOPT = MAX( 1, LWORKOPT ) * - IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN + IF( LWORK.LT.LWORKOPT .AND. .NOT.LQUERY ) THEN INFO = -11 END IF * diff --git a/lapack-netlib/SRC/zgges3.f b/lapack-netlib/SRC/zgges3.f index 8b3e44f88..8235c2543 100644 --- a/lapack-netlib/SRC/zgges3.f +++ b/lapack-netlib/SRC/zgges3.f @@ -215,7 +215,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= MAX(1,2*N) +*> For good performance, LWORK must generally be larger. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -260,7 +261,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEeigen +*> \ingroup gges3 * * ===================================================================== SUBROUTINE ZGGES3( JOBVSL, JOBVSR, SORT, SELCTG, N, A, LDA, B, @@ -300,7 +301,8 @@ LOGICAL CURSL, ILASCL, ILBSCL, ILVSL, ILVSR, LASTSL, $ LQUERY, WANTST INTEGER I, ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT, - $ ILO, IRIGHT, IROWS, IRWRK, ITAU, IWRK, LWKOPT + $ ILO, IRIGHT, IROWS, IRWRK, ITAU, IWRK, LWKOPT, + $ LWKMIN DOUBLE PRECISION ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS, PVSL, $ PVSR, SMLNUM * .. @@ -309,9 +311,8 @@ DOUBLE PRECISION DIF( 2 ) * .. * .. External Subroutines .. - EXTERNAL DLABAD, XERBLA, ZGEQRF, ZGGBAK, ZGGBAL, ZGGHD3, - $ ZLAQZ0, ZLACPY, ZLASCL, ZLASET, ZTGSEN, ZUNGQR, - $ ZUNMQR + EXTERNAL XERBLA, ZGEQRF, ZGGBAK, ZGGBAL, ZGGHD3, ZLAQZ0, + $ ZLACPY, ZLASCL, ZLASET, ZTGSEN, ZUNGQR, ZUNMQR * .. * .. External Functions .. LOGICAL LSAME @@ -353,6 +354,8 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 2*N ) +* IF( IJOBVL.LE.0 ) THEN INFO = -1 ELSE IF( IJOBVR.LE.0 ) THEN @@ -369,7 +372,7 @@ INFO = -14 ELSE IF( LDVSR.LT.1 .OR. ( ILVSR .AND. LDVSR.LT.N ) ) THEN INFO = -16 - ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. 
.NOT.LQUERY ) THEN INFO = -18 END IF * @@ -377,28 +380,32 @@ * IF( INFO.EQ.0 ) THEN CALL ZGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR ) - LWKOPT = MAX( 1, N + INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKMIN, N + INT( WORK( 1 ) ) ) CALL ZUNMQR( 'L', 'C', N, N, N, B, LDB, WORK, A, LDA, WORK, $ -1, IERR ) - LWKOPT = MAX( LWKOPT, N + INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, N + INT( WORK( 1 ) ) ) IF( ILVSL ) THEN CALL ZUNGQR( N, N, N, VSL, LDVSL, WORK, WORK, -1, IERR ) LWKOPT = MAX( LWKOPT, N + INT ( WORK( 1 ) ) ) END IF CALL ZGGHD3( JOBVSL, JOBVSR, N, 1, N, A, LDA, B, LDB, VSL, $ LDVSL, VSR, LDVSR, WORK, -1, IERR ) - LWKOPT = MAX( LWKOPT, N + INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, N + INT( WORK( 1 ) ) ) CALL ZLAQZ0( 'S', JOBVSL, JOBVSR, N, 1, N, A, LDA, B, LDB, $ ALPHA, BETA, VSL, LDVSL, VSR, LDVSR, WORK, -1, $ RWORK, 0, IERR ) - LWKOPT = MAX( LWKOPT, INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) ) IF( WANTST ) THEN CALL ZTGSEN( 0, ILVSL, ILVSR, BWORK, N, A, LDA, B, LDB, $ ALPHA, BETA, VSL, LDVSL, VSR, LDVSR, SDIM, $ PVSL, PVSR, DIF, WORK, -1, IDUM, 1, IERR ) - LWKOPT = MAX( LWKOPT, INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) ) + END IF + IF( N.EQ.0 ) THEN + WORK( 1 ) = 1 + ELSE + WORK( 1 ) = DCMPLX( LWKOPT ) END IF - WORK( 1 ) = DCMPLX( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -420,7 +427,6 @@ EPS = DLAMCH( 'P' ) SMLNUM = DLAMCH( 'S' ) BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) SMLNUM = SQRT( SMLNUM ) / EPS BIGNUM = ONE / SMLNUM * diff --git a/lapack-netlib/SRC/zggev3.f b/lapack-netlib/SRC/zggev3.f index 2d6c74582..0cc073470 100644 --- a/lapack-netlib/SRC/zggev3.f +++ b/lapack-netlib/SRC/zggev3.f @@ -174,7 +174,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= MAX(1,2*N). +*> For good performance, LWORK must generally be larger. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -208,7 +209,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEeigen +*> \ingroup ggev3 * * ===================================================================== SUBROUTINE ZGGEV3( JOBVL, JOBVR, N, A, LDA, B, LDB, ALPHA, BETA, @@ -243,7 +244,7 @@ CHARACTER CHTEMP INTEGER ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT, ILO, $ IN, IRIGHT, IROWS, IRWRK, ITAU, IWRK, JC, JR, - $ LWKOPT + $ LWKMIN, LWKOPT DOUBLE PRECISION ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS, $ SMLNUM, TEMP COMPLEX*16 X @@ -252,9 +253,8 @@ LOGICAL LDUMMA( 1 ) * .. * .. External Subroutines .. - EXTERNAL DLABAD, XERBLA, ZGEQRF, ZGGBAK, ZGGBAL, ZGGHD3, - $ ZLAQZ0, ZLACPY, ZLASCL, ZLASET, ZTGEVC, ZUNGQR, - $ ZUNMQR + EXTERNAL XERBLA, ZGEQRF, ZGGBAK, ZGGBAL, ZGGHD3, ZLAQZ0, + $ ZLACPY, ZLASCL, ZLASET, ZTGEVC, ZUNGQR, ZUNMQR * .. * .. External Functions .. LOGICAL LSAME @@ -301,6 +301,7 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 2*N ) IF( IJOBVL.LE.0 ) THEN INFO = -1 ELSE IF( IJOBVR.LE.0 ) THEN @@ -315,7 +316,7 @@ INFO = -11 ELSE IF( LDVR.LT.1 .OR. ( ILVR .AND. LDVR.LT.N ) ) THEN INFO = -13 - ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. 
.NOT.LQUERY ) THEN INFO = -15 END IF * @@ -323,7 +324,7 @@ * IF( INFO.EQ.0 ) THEN CALL ZGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR ) - LWKOPT = MAX( 1, N+INT( WORK( 1 ) ) ) + LWKOPT = MAX( LWKMIN, N+INT( WORK( 1 ) ) ) CALL ZUNMQR( 'L', 'C', N, N, N, B, LDB, WORK, A, LDA, WORK, $ -1, IERR ) LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) ) @@ -348,7 +349,11 @@ $ RWORK, 0, IERR ) LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) ) END IF - WORK( 1 ) = DCMPLX( LWKOPT ) + IF( N.EQ.0 ) THEN + WORK( 1 ) = 1 + ELSE + WORK( 1 ) = DCMPLX( LWKOPT ) + END IF END IF * IF( INFO.NE.0 ) THEN @@ -368,7 +373,6 @@ EPS = DLAMCH( 'E' )*DLAMCH( 'B' ) SMLNUM = DLAMCH( 'S' ) BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) SMLNUM = SQRT( SMLNUM ) / EPS BIGNUM = ONE / SMLNUM * diff --git a/lapack-netlib/SRC/zgghd3.f b/lapack-netlib/SRC/zgghd3.f index b29cdc70a..08343688d 100644 --- a/lapack-netlib/SRC/zgghd3.f +++ b/lapack-netlib/SRC/zgghd3.f @@ -176,14 +176,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX*16 array, dimension (LWORK) +*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of the array WORK. LWORK >= 1. +*> The length of the array WORK. LWORK >= 1. *> For optimum performance LWORK >= 6*N*NB, where NB is the *> optimal blocksize. *> @@ -208,7 +208,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16OTHERcomputational +*> \ingroup gghd3 * *> \par Further Details: * ===================== @@ -275,7 +275,12 @@ * INFO = 0 NB = ILAENV( 1, 'ZGGHD3', ' ', N, ILO, IHI, -1 ) - LWKOPT = MAX( 6*N*NB, 1 ) + NH = IHI - ILO + 1 + IF( NH.LE.1 ) THEN + LWKOPT = 1 + ELSE + LWKOPT = 6*N*NB + END IF WORK( 1 ) = DCMPLX( LWKOPT ) INITQ = LSAME( COMPQ, 'I' ) WANTQ = INITQ .OR. LSAME( COMPQ, 'V' ) @@ -325,7 +330,6 @@ * * Quick return if possible * - NH = IHI - ILO + 1 IF( NH.LE.1 ) THEN WORK( 1 ) = CONE RETURN @@ -883,6 +887,7 @@ IF ( JCOL.LT.IHI ) $ CALL ZGGHRD( COMPQ2, COMPZ2, N, JCOL, IHI, A, LDA, B, LDB, Q, $ LDQ, Z, LDZ, IERR ) +* WORK( 1 ) = DCMPLX( LWKOPT ) * RETURN diff --git a/lapack-netlib/SRC/zggqrf.f b/lapack-netlib/SRC/zggqrf.f index 0388b0874..d8636d663 100644 --- a/lapack-netlib/SRC/zggqrf.f +++ b/lapack-netlib/SRC/zggqrf.f @@ -173,7 +173,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16OTHERcomputational +*> \ingroup ggqrf * *> \par Further Details: * ===================== @@ -250,7 +250,7 @@ NB2 = ILAENV( 1, 'ZGERQF', ' ', N, P, -1, -1 ) NB3 = ILAENV( 1, 'ZUNMQR', ' ', N, M, P, -1 ) NB = MAX( NB1, NB2, NB3 ) - LWKOPT = MAX( N, M, P )*NB + LWKOPT = MAX( 1, MAX( N, M, P )*NB ) WORK( 1 ) = LWKOPT LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.0 ) THEN diff --git a/lapack-netlib/SRC/zggrqf.f b/lapack-netlib/SRC/zggrqf.f index be912c772..69c14af24 100644 --- a/lapack-netlib/SRC/zggrqf.f +++ b/lapack-netlib/SRC/zggrqf.f @@ -172,7 +172,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. 
* -*> \ingroup complex16OTHERcomputational +*> \ingroup ggrqf * *> \par Further Details: * ===================== @@ -249,7 +249,7 @@ NB2 = ILAENV( 1, 'ZGEQRF', ' ', P, N, -1, -1 ) NB3 = ILAENV( 1, 'ZUNMRQ', ' ', M, N, P, -1 ) NB = MAX( NB1, NB2, NB3 ) - LWKOPT = MAX( N, M, P )*NB + LWKOPT = MAX( 1, MAX( N, M, P )*NB ) WORK( 1 ) = LWKOPT LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN diff --git a/lapack-netlib/SRC/zggsvd3.f b/lapack-netlib/SRC/zggsvd3.f index 71257a7c0..40624f5be 100644 --- a/lapack-netlib/SRC/zggsvd3.f +++ b/lapack-netlib/SRC/zggsvd3.f @@ -277,7 +277,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -332,7 +332,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEsing +*> \ingroup ggsvd3 * *> \par Contributors: * ================== diff --git a/lapack-netlib/SRC/zggsvp3.f b/lapack-netlib/SRC/zggsvp3.f index f39ccdad3..7b465aaee 100644 --- a/lapack-netlib/SRC/zggsvp3.f +++ b/lapack-netlib/SRC/zggsvp3.f @@ -233,7 +233,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -256,7 +256,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16OTHERcomputational +*> \ingroup ggsvp3 * *> \par Further Details: * ===================== diff --git a/lapack-netlib/SRC/zheevd.f b/lapack-netlib/SRC/zheevd.f index ba52f9e72..8e86b9e88 100644 --- a/lapack-netlib/SRC/zheevd.f +++ b/lapack-netlib/SRC/zheevd.f @@ -116,8 +116,7 @@ *> *> \param[out] RWORK *> \verbatim -*> RWORK is DOUBLE PRECISION array, -*> dimension (LRWORK) +*> RWORK is DOUBLE PRECISION array, dimension (MAX(1,LRWORK)) *> On exit, if INFO = 0, RWORK(1) returns the optimal LRWORK. *> \endverbatim *> @@ -180,7 +179,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEeigen +*> \ingroup heevd * *> \par Further Details: * ===================== diff --git a/lapack-netlib/SRC/zheevr.f b/lapack-netlib/SRC/zheevr.f index 1452e04a3..fe6e1a85f 100644 --- a/lapack-netlib/SRC/zheevr.f +++ b/lapack-netlib/SRC/zheevr.f @@ -272,7 +272,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of the array WORK. LWORK >= max(1,2*N). +*> The length of the array WORK. +*> If N <= 1, LWORK >= 1, else LWORK >= 2*N. *> For optimal efficiency, LWORK >= (NB+1)*N, *> where NB is the max of the blocksize for ZHETRD and for *> ZUNMTR as returned by ILAENV. @@ -294,7 +295,8 @@ *> \param[in] LRWORK *> \verbatim *> LRWORK is INTEGER -*> The length of the array RWORK. LRWORK >= max(1,24*N). +*> The length of the array RWORK. +*> If N <= 1, LRWORK >= 1, else LRWORK >= 24*N. *> *> If LRWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal sizes of the WORK, RWORK @@ -313,7 +315,8 @@ *> \param[in] LIWORK *> \verbatim *> LIWORK is INTEGER -*> The dimension of the array IWORK. LIWORK >= max(1,10*N). +*> The dimension of the array IWORK. +*> If N <= 1, LIWORK >= 1, else LIWORK >= 10*N. *> *> If LIWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal sizes of the WORK, RWORK @@ -338,7 +341,7 @@ *> \author Univ. 
of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEeigen +*> \ingroup heevr * *> \par Contributors: * ================== @@ -417,9 +420,15 @@ LQUERY = ( ( LWORK.EQ.-1 ) .OR. ( LRWORK.EQ.-1 ) .OR. $ ( LIWORK.EQ.-1 ) ) * - LRWMIN = MAX( 1, 24*N ) - LIWMIN = MAX( 1, 10*N ) - LWMIN = MAX( 1, 2*N ) + IF( N.LE.1 ) THEN + LWMIN = 1 + LRWMIN = 1 + LIWMIN = 1 + ELSE + LWMIN = 2*N + LRWMIN = 24*N + LIWMIN = 10*N + END IF * INFO = 0 IF( .NOT.( WANTZ .OR. LSAME( JOBZ, 'N' ) ) ) THEN @@ -454,7 +463,7 @@ NB = ILAENV( 1, 'ZHETRD', UPLO, N, -1, -1, -1 ) NB = MAX( NB, ILAENV( 1, 'ZUNMTR', UPLO, N, -1, -1, -1 ) ) LWKOPT = MAX( ( NB+1 )*N, LWMIN ) - WORK( 1 ) = LWKOPT + WORK( 1 ) = LWKOPT RWORK( 1 ) = LRWMIN IWORK( 1 ) = LIWMIN * @@ -483,7 +492,7 @@ END IF * IF( N.EQ.1 ) THEN - WORK( 1 ) = 2 + WORK( 1 ) = 1 IF( ALLEIG .OR. INDEIG ) THEN M = 1 W( 1 ) = DBLE( A( 1, 1 ) ) @@ -710,7 +719,7 @@ * * Set WORK(1) to optimal workspace size. * - WORK( 1 ) = LWKOPT + WORK( 1 ) = LWKOPT RWORK( 1 ) = LRWMIN IWORK( 1 ) = LIWMIN * diff --git a/lapack-netlib/SRC/zheevr_2stage.f b/lapack-netlib/SRC/zheevr_2stage.f index 5c576e633..b1cc7175f 100644 --- a/lapack-netlib/SRC/zheevr_2stage.f +++ b/lapack-netlib/SRC/zheevr_2stage.f @@ -265,7 +265,7 @@ *> indicating the nonzero elements in Z. The i-th eigenvector *> is nonzero only in elements ISUPPZ( 2*i-1 ) through *> ISUPPZ( 2*i ). This is an output of ZSTEMR (tridiagonal -*> matrix). The support of the eigenvectors of A is typically +*> matrix). The support of the eigenvectors of A is typically *> 1:N because of the unitary transformations applied by ZUNMTR. *> Implemented only for RANGE = 'A' or 'I' and IU - IL = N - 1 *> \endverbatim @@ -279,12 +279,13 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. +*> If N <= 1, LWORK must be at least 1. *> If JOBZ = 'N' and N > 1, LWORK must be queried. *> LWORK = MAX(1, 26*N, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N + N -*> = N*KD + N*max(KD+1,FACTOPTNB) -*> + max(2*KD*KD, KD*NTHREADS) +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) *> + (KD+1)*N + N *> where KD is the blocking size of the reduction, *> FACTOPTNB is the blocking used by the QR or LQ @@ -310,7 +311,8 @@ *> \param[in] LRWORK *> \verbatim *> LRWORK is INTEGER -*> The length of the array RWORK. LRWORK >= max(1,24*N). +*> The length of the array RWORK. +*> If N <= 1, LRWORK >= 1, else LRWORK >= 24*N. *> *> If LRWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal sizes of the WORK, RWORK @@ -329,7 +331,8 @@ *> \param[in] LIWORK *> \verbatim *> LIWORK is INTEGER -*> The dimension of the array IWORK. LIWORK >= max(1,10*N). +*> The dimension of the array IWORK. +*> If N <= 1, LIWORK >= 1, else LIWORK >= 10*N. *> *> If LIWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal sizes of the WORK, RWORK @@ -354,7 +357,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEeigen +*> \ingroup heevr_2stage * *> \par Contributors: * ================== @@ -382,7 +385,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). 
*> Denver, Colorado, USA, 2013. @@ -390,11 +393,11 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim * @@ -472,9 +475,16 @@ IB = ILAENV2STAGE( 2, 'ZHETRD_2STAGE', JOBZ, N, KD, -1, -1 ) LHTRD = ILAENV2STAGE( 3, 'ZHETRD_2STAGE', JOBZ, N, KD, IB, -1 ) LWTRD = ILAENV2STAGE( 4, 'ZHETRD_2STAGE', JOBZ, N, KD, IB, -1 ) - LWMIN = N + LHTRD + LWTRD - LRWMIN = MAX( 1, 24*N ) - LIWMIN = MAX( 1, 10*N ) +* + IF( N.LE.1 ) THEN + LWMIN = 1 + LRWMIN = 1 + LIWMIN = 1 + ELSE + LWMIN = N + LHTRD + LWTRD + LRWMIN = 24*N + LIWMIN = 10*N + END IF * INFO = 0 IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN @@ -535,7 +545,7 @@ END IF * IF( N.EQ.1 ) THEN - WORK( 1 ) = 2 + WORK( 1 ) = 1 IF( ALLEIG .OR. INDEIG ) THEN M = 1 W( 1 ) = DBLE( A( 1, 1 ) ) @@ -643,9 +653,9 @@ * * Call ZHETRD_2STAGE to reduce Hermitian matrix to tridiagonal form. * - CALL ZHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, RWORK( INDRD ), + CALL ZHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, RWORK( INDRD ), $ RWORK( INDRE ), WORK( INDTAU ), - $ WORK( INDHOUS ), LHTRD, + $ WORK( INDHOUS ), LHTRD, $ WORK( INDWK ), LLWORK, IINFO ) * * If all eigenvalues are desired diff --git a/lapack-netlib/SRC/zhesv_aa.f b/lapack-netlib/SRC/zhesv_aa.f index df8498c7a..b3d4b3725 100644 --- a/lapack-netlib/SRC/zhesv_aa.f +++ b/lapack-netlib/SRC/zhesv_aa.f @@ -128,7 +128,7 @@ *> LWORK is INTEGER *> The length of WORK. LWORK >= MAX(1,2*N,3*N-2), and for best *> performance LWORK >= max(1,N*NB), where NB is the optimal -*> blocksize for ZHETRF. +*> blocksize for ZHETRF_AA. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -154,7 +154,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEsolve +*> \ingroup hesv_aa * * ===================================================================== SUBROUTINE ZHESV_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, @@ -177,7 +177,7 @@ * * .. Local Scalars .. LOGICAL LQUERY - INTEGER LWKOPT, LWKOPT_HETRF, LWKOPT_HETRS + INTEGER LWKMIN, LWKOPT, LWKOPT_HETRF, LWKOPT_HETRS * .. * .. External Functions .. LOGICAL LSAME @@ -196,6 +196,7 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 2*N, 3*N-2 ) IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -206,17 +207,17 @@ INFO = -5 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -8 - ELSE IF( LWORK.LT.MAX(2*N, 3*N-2) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. 
.NOT.LQUERY ) THEN INFO = -10 END IF * IF( INFO.EQ.0 ) THEN CALL ZHETRF_AA( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) - LWKOPT_HETRF = INT( WORK(1) ) + LWKOPT_HETRF = INT( WORK( 1 ) ) CALL ZHETRS_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, $ -1, INFO ) - LWKOPT_HETRS = INT( WORK(1) ) - LWKOPT = MAX( LWKOPT_HETRF, LWKOPT_HETRS ) + LWKOPT_HETRS = INT( WORK( 1 ) ) + LWKOPT = MAX( LWKMIN, LWKOPT_HETRF, LWKOPT_HETRS ) WORK( 1 ) = LWKOPT END IF * diff --git a/lapack-netlib/SRC/zhesv_aa_2stage.f b/lapack-netlib/SRC/zhesv_aa_2stage.f index 79c01c546..c503b5554 100644 --- a/lapack-netlib/SRC/zhesv_aa_2stage.f +++ b/lapack-netlib/SRC/zhesv_aa_2stage.f @@ -100,14 +100,14 @@ *> *> \param[out] TB *> \verbatim -*> TB is COMPLEX*16 array, dimension (LTB) +*> TB is COMPLEX*16 array, dimension (MAX(1,LTB)). *> On exit, details of the LU factorization of the band matrix. *> \endverbatim *> *> \param[in] LTB *> \verbatim *> LTB is INTEGER -*> The size of the array TB. LTB >= 4*N, internally +*> The size of the array TB. LTB >= MAX(1,4*N), internally *> used to select NB such that LTB >= (3*NB+1)*N. *> *> If LTB = -1, then a workspace query is assumed; the @@ -147,14 +147,15 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX*16 workspace of size LWORK +*> WORK is COMPLEX*16 workspace of size (MAX(1,LWORK)). +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The size of WORK. LWORK >= N, internally used to select NB -*> such that LWORK >= N*NB. +*> The size of WORK. LWORK >= MAX(1,N), internally used to +*> select NB such that LWORK >= N*NB. *> *> If LWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the WORK array, @@ -178,7 +179,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEsolve +*> \ingroup hesv_aa_2stage * * ===================================================================== SUBROUTINE ZHESV_AA_2STAGE( UPLO, N, NRHS, A, LDA, TB, LTB, @@ -208,7 +209,7 @@ * * .. Local Scalars .. LOGICAL UPPER, TQUERY, WQUERY - INTEGER LWKOPT + INTEGER LWKOPT, LWKMIN * .. * .. External Functions .. LOGICAL LSAME @@ -229,6 +230,7 @@ UPPER = LSAME( UPLO, 'U' ) WQUERY = ( LWORK.EQ.-1 ) TQUERY = ( LTB.EQ.-1 ) + LWKMIN = MAX( 1, N ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -237,18 +239,19 @@ INFO = -3 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -5 - ELSE IF( LTB.LT.( 4*N ) .AND. .NOT.TQUERY ) THEN + ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN INFO = -7 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -11 - ELSE IF( LWORK.LT.N .AND. .NOT.WQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.WQUERY ) THEN INFO = -13 END IF * IF( INFO.EQ.0 ) THEN CALL ZHETRF_AA_2STAGE( UPLO, N, A, LDA, TB, -1, IPIV, $ IPIV2, WORK, -1, INFO ) - LWKOPT = INT( WORK(1) ) + LWKOPT = MAX( LWKMIN, INT( WORK( 1 ) ) ) + WORK( 1 ) = LWKOPT END IF * IF( INFO.NE.0 ) THEN diff --git a/lapack-netlib/SRC/zhesvx.f b/lapack-netlib/SRC/zhesvx.f index 485c81df6..64aa16674 100644 --- a/lapack-netlib/SRC/zhesvx.f +++ b/lapack-netlib/SRC/zhesvx.f @@ -234,8 +234,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >= max(1,2*N), and for best -*> performance, when FACT = 'N', LWORK >= max(1,2*N,N*NB), where +*> The length of WORK. LWORK >= MAX(1,2*N), and for best +*> performance, when FACT = 'N', LWORK >= MAX(1,2*N,N*NB), where *> NB is the optimal blocksize for ZHETRF. 
*> *> If LWORK = -1, then a workspace query is assumed; the routine @@ -276,7 +276,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEsolve +*> \ingroup hesvx * * ===================================================================== SUBROUTINE ZHESVX( FACT, UPLO, N, NRHS, A, LDA, AF, LDAF, IPIV, B, @@ -307,7 +307,7 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, NOFACT - INTEGER LWKOPT, NB + INTEGER LWKOPT, LWKMIN, NB DOUBLE PRECISION ANORM * .. * .. External Functions .. @@ -329,6 +329,7 @@ INFO = 0 NOFACT = LSAME( FACT, 'N' ) LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 2*N ) IF( .NOT.NOFACT .AND. .NOT.LSAME( FACT, 'F' ) ) THEN INFO = -1 ELSE IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) @@ -346,12 +347,12 @@ INFO = -11 ELSE IF( LDX.LT.MAX( 1, N ) ) THEN INFO = -13 - ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -18 END IF * IF( INFO.EQ.0 ) THEN - LWKOPT = MAX( 1, 2*N ) + LWKOPT = LWKMIN IF( NOFACT ) THEN NB = ILAENV( 1, 'ZHETRF', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( LWKOPT, N*NB ) diff --git a/lapack-netlib/SRC/zhetrd_2stage.f b/lapack-netlib/SRC/zhetrd_2stage.f index b9d2f0eb1..ab444894b 100644 --- a/lapack-netlib/SRC/zhetrd_2stage.f +++ b/lapack-netlib/SRC/zhetrd_2stage.f @@ -4,23 +4,23 @@ * * =========== DOCUMENTATION =========== * -* Online html documentation available at -* http://www.netlib.org/lapack/explore-html/ +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ * *> \htmlonly -*> Download ZHETRD_2STAGE + dependencies -*> -*> [TGZ] -*> -*> [ZIP] -*> +*> Download ZHETRD_2STAGE + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> *> [TXT] -*> \endhtmlonly +*> \endhtmlonly * * Definition: * =========== * -* SUBROUTINE ZHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, +* SUBROUTINE ZHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, * HOUS2, LHOUS2, WORK, LWORK, INFO ) * * IMPLICIT NONE @@ -34,7 +34,7 @@ * COMPLEX*16 A( LDA, * ), TAU( * ), * HOUS2( * ), WORK( * ) * .. -* +* * *> \par Purpose: * ============= @@ -52,11 +52,11 @@ *> \param[in] VECT *> \verbatim *> VECT is CHARACTER*1 -*> = 'N': No need for the Housholder representation, +*> = 'N': No need for the Housholder representation, *> in particular for the second stage (Band to *> tridiagonal) and thus LHOUS2 is of size max(1, 4*N); -*> = 'V': the Householder representation is needed to -*> either generate Q1 Q2 or to apply Q1 Q2, +*> = 'V': the Householder representation is needed to +*> either generate Q1 Q2 or to apply Q1 Q2, *> then LHOUS2 is to be queried and computed. *> (NOT AVAILABLE IN THIS RELEASE). *> \endverbatim @@ -86,7 +86,7 @@ *> triangular part of A is not referenced. *> On exit, if UPLO = 'U', the band superdiagonal *> of A are overwritten by the corresponding elements of the -*> internal band-diagonal matrix AB, and the elements above +*> internal band-diagonal matrix AB, and the elements above *> the KD superdiagonal, with the array TAU, represent the unitary *> matrix Q1 as a product of elementary reflectors; if UPLO *> = 'L', the diagonal and band subdiagonal of A are over- @@ -117,13 +117,13 @@ *> \param[out] TAU *> \verbatim *> TAU is COMPLEX*16 array, dimension (N-KD) -*> The scalar factors of the elementary reflectors of +*> The scalar factors of the elementary reflectors of *> the first stage (see Further Details). 
*> \endverbatim *> *> \param[out] HOUS2 *> \verbatim -*> HOUS2 is COMPLEX*16 array, dimension (LHOUS2) +*> HOUS2 is COMPLEX*16 array, dimension (MAX(1,LHOUS2)) *> Stores the Householder representation of the stage2 *> band to tridiagonal. *> \endverbatim @@ -132,6 +132,8 @@ *> \verbatim *> LHOUS2 is INTEGER *> The dimension of the array HOUS2. +*> LHOUS2 >= 1. +*> *> If LWORK = -1, or LHOUS2 = -1, *> then a query is assumed; the routine *> only calculates the optimal size of the HOUS2 array, returns @@ -143,23 +145,26 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX*16 array, dimension (LWORK) +*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK = MAX(1, dimension) -*> If LWORK = -1, or LHOUS2=-1, +*> The dimension of the array WORK. +*> If N = 0, LWORK >= 1, else LWORK = MAX(1, dimension). +*> +*> If LWORK = -1, or LHOUS2 = -1, *> then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> LWORK = MAX(1, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N -*> = N*KD + N*max(KD+1,FACTOPTNB) -*> + max(2*KD*KD, KD*NTHREADS) -*> + (KD+1)*N +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) +*> + (KD+1)*N *> where KD is the blocking size of the reduction, *> FACTOPTNB is the blocking used by the QR or LQ *> algorithm, usually FACTOPTNB=128 is a good choice @@ -177,12 +182,12 @@ * Authors: * ======== * -*> \author Univ. of Tennessee -*> \author Univ. of California Berkeley -*> \author Univ. of Colorado Denver -*> \author NAG Ltd. +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. * -*> \ingroup complex16HEcomputational +*> \ingroup hetrd_2stage * *> \par Further Details: * ===================== @@ -202,7 +207,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -210,16 +215,16 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. 
-*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim *> * ===================================================================== - SUBROUTINE ZHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, + SUBROUTINE ZHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, $ HOUS2, LHOUS2, WORK, LWORK, INFO ) * IMPLICIT NONE @@ -265,10 +270,13 @@ * KD = ILAENV2STAGE( 1, 'ZHETRD_2STAGE', VECT, N, -1, -1, -1 ) IB = ILAENV2STAGE( 2, 'ZHETRD_2STAGE', VECT, N, KD, -1, -1 ) - LHMIN = ILAENV2STAGE( 3, 'ZHETRD_2STAGE', VECT, N, KD, IB, -1 ) - LWMIN = ILAENV2STAGE( 4, 'ZHETRD_2STAGE', VECT, N, KD, IB, -1 ) -* WRITE(*,*),'ZHETRD_2STAGE N KD UPLO LHMIN LWMIN ',N, KD, UPLO, -* $ LHMIN, LWMIN + IF( N.EQ.0 ) THEN + LHMIN = 1 + LWMIN = 1 + ELSE + LHMIN = ILAENV2STAGE( 3, 'ZHETRD_2STAGE', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV2STAGE( 4, 'ZHETRD_2STAGE', VECT, N, KD, IB, -1 ) + END IF * IF( .NOT.LSAME( VECT, 'N' ) ) THEN INFO = -1 @@ -309,14 +317,14 @@ LWRK = LWORK-LDAB*N ABPOS = 1 WPOS = ABPOS + LDAB*N - CALL ZHETRD_HE2HB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB, + CALL ZHETRD_HE2HB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB, $ TAU, WORK( WPOS ), LWRK, INFO ) IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZHETRD_HE2HB', -INFO ) RETURN END IF - CALL ZHETRD_HB2ST( 'Y', VECT, UPLO, N, KD, - $ WORK( ABPOS ), LDAB, D, E, + CALL ZHETRD_HB2ST( 'Y', VECT, UPLO, N, KD, + $ WORK( ABPOS ), LDAB, D, E, $ HOUS2, LHOUS2, WORK( WPOS ), LWRK, INFO ) IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZHETRD_HB2ST', -INFO ) @@ -324,7 +332,6 @@ END IF * * - HOUS2( 1 ) = LHMIN WORK( 1 ) = LWMIN RETURN * diff --git a/lapack-netlib/SRC/zhetrd_hb2st.F b/lapack-netlib/SRC/zhetrd_hb2st.F index 1d39ac942..247497ab6 100644 --- a/lapack-netlib/SRC/zhetrd_hb2st.F +++ b/lapack-netlib/SRC/zhetrd_hb2st.F @@ -18,7 +18,7 @@ * Definition: * =========== * -* SUBROUTINE ZHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, +* SUBROUTINE ZHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, * D, E, HOUS, LHOUS, WORK, LWORK, INFO ) * * #if defined(_OPENMP) @@ -53,12 +53,12 @@ *> \param[in] STAGE1 *> \verbatim *> STAGE1 is CHARACTER*1 -*> = 'N': "No": to mention that the stage 1 of the reduction +*> = 'N': "No": to mention that the stage 1 of the reduction *> from dense to band using the zhetrd_he2hb routine -*> was not called before this routine to reproduce AB. -*> In other term this routine is called as standalone. -*> = 'Y': "Yes": to mention that the stage 1 of the -*> reduction from dense to band using the zhetrd_he2hb +*> was not called before this routine to reproduce AB. +*> In other term this routine is called as standalone. +*> = 'Y': "Yes": to mention that the stage 1 of the +*> reduction from dense to band using the zhetrd_he2hb *> routine has been called to produce AB (e.g., AB is *> the output of zhetrd_he2hb. *> \endverbatim @@ -66,10 +66,10 @@ *> \param[in] VECT *> \verbatim *> VECT is CHARACTER*1 -*> = 'N': No need for the Housholder representation, +*> = 'N': No need for the Housholder representation, *> and thus LHOUS is of size max(1, 4*N); -*> = 'V': the Householder representation is needed to -*> either generate or to apply Q later on, +*> = 'V': the Householder representation is needed to +*> either generate or to apply Q later on, *> then LHOUS is to be queried and computed. *> (NOT AVAILABLE IN THIS RELEASE). *> \endverbatim @@ -132,34 +132,39 @@ *> *> \param[out] HOUS *> \verbatim -*> HOUS is COMPLEX*16 array, dimension LHOUS, that -*> store the Householder representation. 
+*> HOUS is COMPLEX*16 array, dimension (MAX(1,LHOUS)) +*> Stores the Householder representation. *> \endverbatim *> *> \param[in] LHOUS *> \verbatim *> LHOUS is INTEGER -*> The dimension of the array HOUS. LHOUS = MAX(1, dimension) -*> If LWORK = -1, or LHOUS=-1, +*> The dimension of the array HOUS. +*> If N = 0 or KD <= 1, LHOUS >= 1, else LHOUS = MAX(1, dimension). +*> +*> If LWORK = -1, or LHOUS = -1, *> then a query is assumed; the routine *> only calculates the optimal size of the HOUS array, returns *> this value as the first entry of the HOUS array, and no error *> message related to LHOUS is issued by XERBLA. *> LHOUS = MAX(1, dimension) where *> dimension = 4*N if VECT='N' -*> not available now if VECT='H' +*> not available now if VECT='H' *> \endverbatim *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX*16 array, dimension LWORK. +*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)). +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK = MAX(1, dimension) -*> If LWORK = -1, or LHOUS=-1, +*> The dimension of the array WORK. +*> If N = 0 or KD <= 1, LWORK >= 1, else LWORK = MAX(1, dimension). +*> +*> If LWORK = -1, or LHOUS = -1, *> then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error @@ -188,7 +193,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16OTHERcomputational +*> \ingroup hetrd_hb2st * *> \par Further Details: * ===================== @@ -208,7 +213,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -216,16 +221,16 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim *> * ===================================================================== - SUBROUTINE ZHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, + SUBROUTINE ZHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, $ D, E, HOUS, LHOUS, WORK, LWORK, INFO ) * * @@ -259,11 +264,11 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, WANTQ, UPPER, AFTERS1 - INTEGER I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST, + INTEGER I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST, $ ED, STIND, EDIND, BLKLASTIND, COLPT, THED, $ STEPERCOL, GRSIZ, THGRSIZ, THGRNB, THGRID, - $ NBTILES, TTYPE, TID, NTHREADS, DEBUG, - $ ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS, + $ NBTILES, TTYPE, TID, NTHREADS, + $ ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS, $ INDA, INDW, APOS, SIZEA, LDA, INDV, INDTAU, $ SIZEV, SIZETAU, LDV, LHMIN, LWMIN DOUBLE PRECISION ABSTMP @@ -277,7 +282,7 @@ * .. * .. External Functions .. 
LOGICAL LSAME - INTEGER ILAENV2STAGE + INTEGER ILAENV2STAGE EXTERNAL LSAME, ILAENV2STAGE * .. * .. Executable Statements .. @@ -285,7 +290,6 @@ * Determine the minimal workspace size required. * Test the input parameters * - DEBUG = 0 INFO = 0 AFTERS1 = LSAME( STAGE1, 'Y' ) WANTQ = LSAME( VECT, 'V' ) @@ -294,9 +298,14 @@ * * Determine the block size, the workspace size and the hous size. * - IB = ILAENV2STAGE( 2, 'ZHETRD_HB2ST', VECT, N, KD, -1, -1 ) - LHMIN = ILAENV2STAGE( 3, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 ) - LWMIN = ILAENV2STAGE( 4, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 ) + IB = ILAENV2STAGE( 2, 'ZHETRD_HB2ST', VECT, N, KD, -1, -1 ) + IF( N.EQ.0 .OR. KD.LE.1 ) THEN + LHMIN = 1 + LWMIN = 1 + ELSE + LHMIN = ILAENV2STAGE( 3, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV2STAGE( 4, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 ) + END IF * IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN INFO = -1 @@ -358,7 +367,7 @@ ABDPOS = KD + 1 ABOFDPOS = KD ELSE - APOS = INDA + APOS = INDA AWPOS = INDA + KD + 1 DPOS = APOS OFDPOS = DPOS + 1 @@ -366,11 +375,11 @@ ABOFDPOS = 2 ENDIF -* -* Case KD=0: -* The matrix is diagonal. We just copy it (convert to "real" for -* complex because D is double and the imaginary part should be 0) -* and store it in D. A sequential code here is better or +* +* Case KD=0: +* The matrix is diagonal. We just copy it (convert to "real" for +* complex because D is double and the imaginary part should be 0) +* and store it in D. A sequential code here is better or * in a parallel environment it might need two cores for D and E * IF( KD.EQ.0 ) THEN @@ -385,17 +394,17 @@ WORK( 1 ) = 1 RETURN END IF -* -* Case KD=1: -* The matrix is already Tridiagonal. We have to make diagonal +* +* Case KD=1: +* The matrix is already Tridiagonal. We have to make diagonal * and offdiagonal elements real, and store them in D and E. -* For that, for real precision just copy the diag and offdiag -* to D and E while for the COMPLEX case the bulge chasing is -* performed to convert the hermetian tridiagonal to symmetric -* tridiagonal. A simpler conversion formula might be used, but then +* For that, for real precision just copy the diag and offdiag +* to D and E while for the COMPLEX case the bulge chasing is +* performed to convert the hermetian tridiagonal to symmetric +* tridiagonal. A simpler conversion formula might be used, but then * updating the Q matrix will be required and based if Q is generated -* or not this might complicate the story. -* +* or not this might complicate the story. +* IF( KD.EQ.1 ) THEN DO 50 I = 1, N D( I ) = DBLE( AB( ABDPOS, I ) ) @@ -444,7 +453,7 @@ C END IF RETURN END IF * -* Main code start here. +* Main code start here. * Reduce the hermitian band of A to a tridiagonal matrix. 
* THGRSIZ = N @@ -453,7 +462,7 @@ C END IF NBTILES = CEILING( REAL(N)/REAL(KD) ) STEPERCOL = CEILING( REAL(SHIFT)/REAL(GRSIZ) ) THGRNB = CEILING( REAL(N-1)/REAL(THGRSIZ) ) -* +* CALL ZLACPY( "A", KD+1, N, AB, LDAB, WORK( APOS ), LDA ) CALL ZLASET( "A", KD, N, ZERO, ZERO, WORK( AWPOS ), LDA ) * @@ -462,7 +471,7 @@ C END IF * #if defined(_OPENMP) !$OMP PARALLEL PRIVATE( TID, THGRID, BLKLASTIND ) -!$OMP$ PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID ) +!$OMP$ PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID ) !$OMP$ PRIVATE( MYID, TTYPE, COLPT, STIND, EDIND ) !$OMP$ SHARED ( UPLO, WANTQ, INDV, INDTAU, HOUS, WORK) !$OMP$ SHARED ( N, KD, IB, NBTILES, LDA, LDV, INDA ) @@ -471,7 +480,7 @@ C END IF #endif * * main bulge chasing loop -* +* DO 100 THGRID = 1, THGRNB STT = (THGRID-1)*THGRSIZ+1 THED = MIN( (STT + THGRSIZ -1), (N-1)) @@ -482,7 +491,7 @@ C END IF ST = STT DO 130 SWEEPID = ST, ED DO 140 K = 1, GRSIZ - MYID = (I-SWEEPID)*(STEPERCOL*GRSIZ) + MYID = (I-SWEEPID)*(STEPERCOL*GRSIZ) $ + (M-1)*GRSIZ + K IF ( MYID.EQ.1 ) THEN TTYPE = 1 @@ -508,17 +517,17 @@ C END IF ENDIF * * Call the kernel -* +* #if defined(_OPENMP) && _OPENMP >= 201307 - IF( TTYPE.NE.1 ) THEN + IF( TTYPE.NE.1 ) THEN !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) !$OMP$ DEPEND(in:WORK(MYID-1)) !$OMP$ DEPEND(out:WORK(MYID)) TID = OMP_GET_THREAD_NUM() - CALL ZHB2ST_KERNELS( UPLO, WANTQ, TTYPE, + CALL ZHB2ST_KERNELS( UPLO, WANTQ, TTYPE, $ STIND, EDIND, SWEEPID, N, KD, IB, - $ WORK ( INDA ), LDA, + $ WORK ( INDA ), LDA, $ HOUS( INDV ), HOUS( INDTAU ), LDV, $ WORK( INDW + TID*KD ) ) !$OMP END TASK @@ -526,20 +535,20 @@ C END IF !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) !$OMP$ DEPEND(out:WORK(MYID)) TID = OMP_GET_THREAD_NUM() - CALL ZHB2ST_KERNELS( UPLO, WANTQ, TTYPE, + CALL ZHB2ST_KERNELS( UPLO, WANTQ, TTYPE, $ STIND, EDIND, SWEEPID, N, KD, IB, - $ WORK ( INDA ), LDA, + $ WORK ( INDA ), LDA, $ HOUS( INDV ), HOUS( INDTAU ), LDV, $ WORK( INDW + TID*KD ) ) !$OMP END TASK ENDIF #else - CALL ZHB2ST_KERNELS( UPLO, WANTQ, TTYPE, + CALL ZHB2ST_KERNELS( UPLO, WANTQ, TTYPE, $ STIND, EDIND, SWEEPID, N, KD, IB, - $ WORK ( INDA ), LDA, + $ WORK ( INDA ), LDA, $ HOUS( INDV ), HOUS( INDTAU ), LDV, $ WORK( INDW ) ) -#endif +#endif IF ( BLKLASTIND.GE.(N-1) ) THEN STT = STT + 1 EXIT @@ -554,14 +563,14 @@ C END IF !$OMP END MASTER !$OMP END PARALLEL #endif -* +* * Copy the diagonal from A to D. Note that D is REAL thus only * the Real part is needed, the imaginary part should be zero. * DO 150 I = 1, N D( I ) = DBLE( WORK( DPOS+(I-1)*LDA ) ) 150 CONTINUE -* +* * Copy the off diagonal from A to E. Note that E is REAL thus only * the Real part is needed, the imaginary part should be zero. * @@ -575,11 +584,10 @@ C END IF 170 CONTINUE ENDIF * - HOUS( 1 ) = LHMIN WORK( 1 ) = LWMIN RETURN * * End of ZHETRD_HB2ST * END - + diff --git a/lapack-netlib/SRC/zhetrd_he2hb.f b/lapack-netlib/SRC/zhetrd_he2hb.f index e1b2e1794..3e3bfa374 100644 --- a/lapack-netlib/SRC/zhetrd_he2hb.f +++ b/lapack-netlib/SRC/zhetrd_he2hb.f @@ -123,8 +123,8 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX*16 array, dimension (LWORK) -*> On exit, if INFO = 0, or if LWORK=-1, +*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, or if LWORK = -1, *> WORK(1) returns the size of LWORK. *> \endverbatim *> @@ -132,7 +132,9 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK which should be calculated -*> by a workspace query. LWORK = MAX(1, LWORK_QUERY) +*> by a workspace query. +*> If N <= KD+1, LWORK >= 1, else LWORK = MAX(1, LWORK_QUERY). 
+*> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error @@ -158,7 +160,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEcomputational +*> \ingroup hetrd_he2hb * *> \par Further Details: * ===================== @@ -293,8 +295,12 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) - LWMIN = ILAENV2STAGE( 4, 'ZHETRD_HE2HB', '', N, KD, -1, -1 ) - + IF( N.LE.KD+1 ) THEN + LWMIN = 1 + ELSE + LWMIN = ILAENV2STAGE( 4, 'ZHETRD_HE2HB', '', N, KD, -1, -1 ) + END IF +* IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN diff --git a/lapack-netlib/SRC/zhetrf.f b/lapack-netlib/SRC/zhetrf.f index 78d4f71b8..433887108 100644 --- a/lapack-netlib/SRC/zhetrf.f +++ b/lapack-netlib/SRC/zhetrf.f @@ -107,7 +107,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned by ILAENV. *> \endverbatim *> @@ -130,7 +130,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEcomputational +*> \ingroup hetrf * *> \par Further Details: * ===================== @@ -227,7 +227,7 @@ * Determine the block size * NB = ILAENV( 1, 'ZHETRF', UPLO, N, -1, -1, -1 ) - LWKOPT = N*NB + LWKOPT = MAX( 1, N*NB ) WORK( 1 ) = LWKOPT END IF * @@ -346,6 +346,7 @@ END IF * 40 CONTINUE +* WORK( 1 ) = LWKOPT RETURN * diff --git a/lapack-netlib/SRC/zhetrf_aa.f b/lapack-netlib/SRC/zhetrf_aa.f index 537c16e8c..381c87d51 100644 --- a/lapack-netlib/SRC/zhetrf_aa.f +++ b/lapack-netlib/SRC/zhetrf_aa.f @@ -101,8 +101,10 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >= MAX(1,2*N). For optimum performance -*> LWORK >= N*(1+NB), where NB is the optimal blocksize. +*> The length of WORK. +*> LWORK >= 1, if N >= 1, and LWORK >= 2*N, otherwise. +*> For optimum performance LWORK >= N*(1+NB), where NB is +*> the optimal blocksize, returned by ILAENV. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -125,10 +127,10 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEcomputational +*> \ingroup hetrf_aa * * ===================================================================== - SUBROUTINE ZHETRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO) + SUBROUTINE ZHETRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -152,7 +154,7 @@ * * .. Local Scalars .. LOGICAL LQUERY, UPPER - INTEGER J, LWKOPT + INTEGER J, LWKMIN, LWKOPT INTEGER NB, MJ, NJ, K1, K2, J1, J2, J3, JB COMPLEX*16 ALPHA * .. @@ -178,18 +180,25 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) + IF( N.LE.1 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = 2*N + LWKOPT = (NB+1)*N + END IF +* IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. 
.NOT.LQUERY ) THEN INFO = -7 END IF * IF( INFO.EQ.0 ) THEN - LWKOPT = (NB+1)*N WORK( 1 ) = LWKOPT END IF * @@ -202,11 +211,11 @@ * * Quick return * - IF ( N.EQ.0 ) THEN + IF( N.EQ.0 ) THEN RETURN ENDIF IPIV( 1 ) = 1 - IF ( N.EQ.1 ) THEN + IF( N.EQ.1 ) THEN A( 1, 1 ) = DBLE( A( 1, 1 ) ) RETURN END IF diff --git a/lapack-netlib/SRC/zhetrf_aa_2stage.f b/lapack-netlib/SRC/zhetrf_aa_2stage.f index 477602b5e..bab13a99d 100644 --- a/lapack-netlib/SRC/zhetrf_aa_2stage.f +++ b/lapack-netlib/SRC/zhetrf_aa_2stage.f @@ -87,14 +87,14 @@ *> *> \param[out] TB *> \verbatim -*> TB is COMPLEX*16 array, dimension (LTB) +*> TB is COMPLEX*16 array, dimension (MAX(1,LTB)) *> On exit, details of the LU factorization of the band matrix. *> \endverbatim *> *> \param[in] LTB *> \verbatim *> LTB is INTEGER -*> The size of the array TB. LTB >= 4*N, internally +*> The size of the array TB. LTB >= MAX(1,4*N), internally *> used to select NB such that LTB >= (3*NB+1)*N. *> *> If LTB = -1, then a workspace query is assumed; the @@ -121,14 +121,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX*16 workspace of size LWORK +*> WORK is COMPLEX*16 workspace of size (MAX(1,LWORK)) *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The size of WORK. LWORK >= N, internally used to select NB -*> such that LWORK >= N*NB. +*> The size of WORK. LWORK >= MAX(1,N), internally used to +*> select NB such that LWORK >= N*NB. *> *> If LWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the WORK array, @@ -152,7 +152,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16SYcomputational +*> \ingroup hetrf_aa_2stage * * ===================================================================== SUBROUTINE ZHETRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, @@ -182,7 +182,7 @@ * .. Local Scalars .. LOGICAL UPPER, TQUERY, WQUERY INTEGER I, J, K, I1, I2, TD - INTEGER LDTB, NB, KB, JB, NT, IINFO + INTEGER LWKOPT, LDTB, NB, KB, JB, NT, IINFO COMPLEX*16 PIV * .. * .. External Functions .. @@ -212,9 +212,9 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF ( LTB .LT. 4*N .AND. .NOT.TQUERY ) THEN + ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN INFO = -6 - ELSE IF ( LWORK .LT. N .AND. .NOT.WQUERY ) THEN + ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.WQUERY ) THEN INFO = -10 END IF * @@ -228,10 +228,10 @@ NB = ILAENV( 1, 'ZHETRF_AA_2STAGE', UPLO, N, -1, -1, -1 ) IF( INFO.EQ.0 ) THEN IF( TQUERY ) THEN - TB( 1 ) = (3*NB+1)*N + TB( 1 ) = MAX( 1, (3*NB+1)*N ) END IF IF( WQUERY ) THEN - WORK( 1 ) = N*NB + WORK( 1 ) = MAX( 1, N*NB ) END IF END IF IF( TQUERY .OR. WQUERY ) THEN @@ -240,7 +240,7 @@ * * Quick return * - IF ( N.EQ.0 ) THEN + IF( N.EQ.0 ) THEN RETURN ENDIF * @@ -392,7 +392,7 @@ CALL ZGETRF( N-(J+1)*NB, NB, $ WORK, N, $ IPIV( (J+1)*NB+1 ), IINFO ) -c IF (IINFO.NE.0 .AND. INFO.EQ.0) THEN +c IF( IINFO.NE.0 .AND. INFO.EQ.0 ) THEN c INFO = IINFO+(J+1)*NB c END IF * @@ -587,7 +587,7 @@ c END IF CALL ZGETRF( N-(J+1)*NB, NB, $ A( (J+1)*NB+1, J*NB+1 ), LDA, $ IPIV( (J+1)*NB+1 ), IINFO ) -c IF (IINFO.NE.0 .AND. INFO.EQ.0) THEN +c IF( IINFO.NE.0 .AND. INFO.EQ.0 ) THEN c INFO = IINFO+(J+1)*NB c END IF * diff --git a/lapack-netlib/SRC/zhetrf_rk.f b/lapack-netlib/SRC/zhetrf_rk.f index 73dd9f9d0..7c505fa4d 100644 --- a/lapack-netlib/SRC/zhetrf_rk.f +++ b/lapack-netlib/SRC/zhetrf_rk.f @@ -177,14 +177,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX*16 array, dimension ( MAX(1,LWORK) ). 
+*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)). *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned *> by ILAENV. *> @@ -229,7 +229,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEcomputational +*> \ingroup hetrf_rk * *> \par Further Details: * ===================== @@ -310,7 +310,7 @@ * Determine the block size * NB = ILAENV( 1, 'ZHETRF_RK', UPLO, N, -1, -1, -1 ) - LWKOPT = N*NB + LWKOPT = MAX( 1, N*NB ) WORK( 1 ) = LWKOPT END IF * diff --git a/lapack-netlib/SRC/zhetrf_rook.f b/lapack-netlib/SRC/zhetrf_rook.f index e9de47248..a56349092 100644 --- a/lapack-netlib/SRC/zhetrf_rook.f +++ b/lapack-netlib/SRC/zhetrf_rook.f @@ -122,7 +122,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned by ILAENV. *> *> If LWORK = -1, then a workspace query is assumed; the routine @@ -150,7 +150,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEcomputational +*> \ingroup hetrf_rook * *> \par Further Details: * ===================== diff --git a/lapack-netlib/SRC/zhetri2.f b/lapack-netlib/SRC/zhetri2.f index 384745c3a..1d932b866 100644 --- a/lapack-netlib/SRC/zhetri2.f +++ b/lapack-netlib/SRC/zhetri2.f @@ -88,16 +88,16 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX*16 array, dimension (N+NB+1)*(NB+3) +*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)). *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*> WORK is size >= (N+NB+1)*(NB+3) +*> If N = 0, LWORK >= 1, else LWORK >= (N+NB+1)*(NB+3). *> If LWORK = -1, then a workspace query is assumed; the routine -*> calculates: +*> calculates: *> - the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, *> - and no error message related to LWORK is issued by XERBLA. @@ -120,7 +120,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEcomputational +*> \ingroup hetri2 * * ===================================================================== SUBROUTINE ZHETRI2( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO ) @@ -159,9 +159,13 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) +* * Get blocksize +* NBMAX = ILAENV( 1, 'ZHETRF', UPLO, N, -1, -1, -1 ) - IF ( NBMAX .GE. N ) THEN + IF( N.EQ.0 ) THEN + MINSIZE = 1 + ELSE IF( NBMAX.GE.N ) THEN MINSIZE = N ELSE MINSIZE = (N+NBMAX+1)*(NBMAX+3) @@ -173,28 +177,29 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF (LWORK .LT. MINSIZE .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.MINSIZE .AND. .NOT.LQUERY ) THEN INFO = -7 END IF -* -* Quick return if possible -* * IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZHETRI2', -INFO ) RETURN ELSE IF( LQUERY ) THEN - WORK(1)=MINSIZE + WORK( 1 ) = MINSIZE RETURN END IF +* +* Quick return if possible +* IF( N.EQ.0 ) $ RETURN - IF( NBMAX .GE. 
N ) THEN + IF( NBMAX.GE.N ) THEN CALL ZHETRI( UPLO, N, A, LDA, IPIV, WORK, INFO ) ELSE CALL ZHETRI2X( UPLO, N, A, LDA, IPIV, WORK, NBMAX, INFO ) END IF +* RETURN * * End of ZHETRI2 diff --git a/lapack-netlib/SRC/zhetrs_aa.f b/lapack-netlib/SRC/zhetrs_aa.f index 06ac1fd28..b7a1f7f07 100644 --- a/lapack-netlib/SRC/zhetrs_aa.f +++ b/lapack-netlib/SRC/zhetrs_aa.f @@ -106,7 +106,13 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,3*N-2). +*> The dimension of the array WORK. +*> If MIN(N,NRHS) = 0, LWORK >= 1, else LWORK >= 3*N-2. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the minimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. *> \endverbatim *> *> \param[out] INFO @@ -124,7 +130,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEcomputational +*> \ingroup hetrs_aa * * ===================================================================== SUBROUTINE ZHETRS_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, @@ -152,7 +158,7 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, UPPER - INTEGER K, KP, LWKOPT + INTEGER K, KP, LWKMIN * .. * .. External Functions .. LOGICAL LSAME @@ -162,13 +168,19 @@ EXTERNAL ZGTSV, ZSWAP, ZTRSM, ZLACGV, ZLACPY, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC MAX + INTRINSIC MIN, MAX * .. * .. Executable Statements .. * INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) + IF( MIN( N, NRHS ).EQ.0 ) THEN + LWKMIN = 1 + ELSE + LWKMIN = 3*N-2 + END IF +* IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -179,21 +191,20 @@ INFO = -5 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -8 - ELSE IF( LWORK.LT.MAX( 1, 3*N-2 ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZHETRS_AA', -INFO ) RETURN ELSE IF( LQUERY ) THEN - LWKOPT = (3*N-2) - WORK( 1 ) = LWKOPT + WORK( 1 ) = LWKMIN RETURN END IF * * Quick return if possible * - IF( N.EQ.0 .OR. NRHS.EQ.0 ) + IF( MIN( N, NRHS ).EQ.0 ) $ RETURN * IF( UPPER ) THEN diff --git a/lapack-netlib/SRC/zlamswlq.f b/lapack-netlib/SRC/zlamswlq.f index 4abefa434..59a0a5558 100644 --- a/lapack-netlib/SRC/zlamswlq.f +++ b/lapack-netlib/SRC/zlamswlq.f @@ -127,17 +127,20 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*> If SIDE = 'L', LWORK >= max(1,NB) * MB; -*> if SIDE = 'R', LWORK >= max(1,M) * MB. +*> If MIN(M,N,K) = 0, LWORK >= 1. +*> If SIDE = 'L', LWORK >= max(1,NB*MB). +*> If SIDE = 'R', LWORK >= max(1,M*MB). +*> *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> \endverbatim @@ -189,92 +192,103 @@ *> SIAM J. Sci. Comput, vol. 34, no. 
1, 2012 *> \endverbatim *> +*> \ingroup lamswlq +*> * ===================================================================== SUBROUTINE ZLAMSWLQ( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T, - $ LDT, C, LDC, WORK, LWORK, INFO ) + $ LDT, C, LDC, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * * .. Scalar Arguments .. - CHARACTER SIDE, TRANS - INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC + CHARACTER SIDE, TRANS + INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC * .. * .. Array Arguments .. - COMPLEX*16 A( LDA, * ), WORK( * ), C(LDC, * ), - $ T( LDT, * ) + COMPLEX*16 A( LDA, * ), WORK( * ), C( LDC, * ), + $ T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER I, II, KK, LW, CTR + LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY + INTEGER I, II, KK, LW, CTR, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME +* .. * .. External Subroutines .. - EXTERNAL ZTPMLQT, ZGEMLQT, XERBLA + EXTERNAL ZTPMLQT, ZGEMLQT, XERBLA * .. * .. Executable Statements .. * * Test the input arguments * - LQUERY = LWORK.LT.0 + INFO = 0 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'C' ) LEFT = LSAME( SIDE, 'L' ) RIGHT = LSAME( SIDE, 'R' ) - IF (LEFT) THEN + IF( LEFT ) THEN LW = N * MB ELSE LW = M * MB END IF * - INFO = 0 + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF +* IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN - INFO = -1 + INFO = -1 ELSE IF( .NOT.TRAN .AND. .NOT.NOTRAN ) THEN - INFO = -2 + INFO = -2 ELSE IF( K.LT.0 ) THEN INFO = -5 ELSE IF( M.LT.K ) THEN INFO = -3 ELSE IF( N.LT.0 ) THEN INFO = -4 - ELSE IF( K.LT.MB .OR. MB.LT.1) THEN + ELSE IF( K.LT.MB .OR. MB.LT.1 ) THEN INFO = -6 ELSE IF( LDA.LT.MAX( 1, K ) ) THEN INFO = -9 - ELSE IF( LDT.LT.MAX( 1, MB) ) THEN + ELSE IF( LDT.LT.MAX( 1, MB ) ) THEN INFO = -11 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN - INFO = -13 - ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN + INFO = -13 + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -15 END IF * + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = LWMIN + END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZLAMSWLQ', -INFO ) - WORK(1) = LW RETURN - ELSE IF (LQUERY) THEN - WORK(1) = LW + ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * - IF( MIN(M,N,K).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * IF((NB.LE.K).OR.(NB.GE.MAX(M,N,K))) THEN CALL ZGEMLQT( SIDE, TRANS, M, N, K, MB, A, LDA, - $ T, LDT, C, LDC, WORK, INFO) + $ T, LDT, C, LDC, WORK, INFO ) RETURN END IF * @@ -403,7 +417,7 @@ * END IF * - WORK(1) = LW + WORK( 1 ) = LWMIN RETURN * * End of ZLAMSWLQ diff --git a/lapack-netlib/SRC/zlamtsqr.f b/lapack-netlib/SRC/zlamtsqr.f index 5030cb75f..03770c06e 100644 --- a/lapack-netlib/SRC/zlamtsqr.f +++ b/lapack-netlib/SRC/zlamtsqr.f @@ -128,22 +128,24 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) -*> +*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim +*> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. +*> If MIN(M,N,K) = 0, LWORK >= 1. +*> If SIDE = 'L', LWORK >= max(1,N*NB). +*> If SIDE = 'R', LWORK >= max(1,MB*NB). 
*> -*> If SIDE = 'L', LWORK >= max(1,N)*NB; -*> if SIDE = 'R', LWORK >= max(1,MB)*NB. *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. -*> *> \endverbatim +*> *> \param[out] INFO *> \verbatim *> INFO is INTEGER @@ -191,46 +193,50 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup lamtsqr +*> * ===================================================================== SUBROUTINE ZLAMTSQR( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T, - $ LDT, C, LDC, WORK, LWORK, INFO ) + $ LDT, C, LDC, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * * .. Scalar Arguments .. - CHARACTER SIDE, TRANS - INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC + CHARACTER SIDE, TRANS + INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC * .. * .. Array Arguments .. - COMPLEX*16 A( LDA, * ), WORK( * ), C(LDC, * ), - $ T( LDT, * ) + COMPLEX*16 A( LDA, * ), WORK( * ), C( LDC, * ), + $ T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER I, II, KK, LW, CTR, Q + LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY + INTEGER I, II, KK, LW, CTR, Q, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME +* .. * .. External Subroutines .. - EXTERNAL ZGEMQRT, ZTPMQRT, XERBLA + EXTERNAL ZGEMQRT, ZTPMQRT, XERBLA * .. * .. Executable Statements .. * * Test the input arguments * - LQUERY = LWORK.LT.0 + INFO = 0 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'C' ) LEFT = LSAME( SIDE, 'L' ) RIGHT = LSAME( SIDE, 'R' ) - IF (LEFT) THEN + IF( LEFT ) THEN LW = N * NB Q = M ELSE @@ -238,11 +244,17 @@ Q = N END IF * - INFO = 0 + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF +* IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN - INFO = -1 + INFO = -1 ELSE IF( .NOT.TRAN .AND. .NOT.NOTRAN ) THEN - INFO = -2 + INFO = -2 ELSE IF( M.LT.K ) THEN INFO = -3 ELSE IF( N.LT.0 ) THEN @@ -253,38 +265,38 @@ INFO = -7 ELSE IF( LDA.LT.MAX( 1, Q ) ) THEN INFO = -9 - ELSE IF( LDT.LT.MAX( 1, NB) ) THEN + ELSE IF( LDT.LT.MAX( 1, NB ) ) THEN INFO = -11 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN - INFO = -13 - ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN + INFO = -13 + ELSE IF( LWORK.LT.LWMIN .AND. 
(.NOT.LQUERY) ) THEN INFO = -15 END IF * -* Determine the block size if it is tall skinny or short and wide -* - IF( INFO.EQ.0) THEN - WORK(1) = LW + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = LWMIN END IF * IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZLAMTSQR', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N,K).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF +* +* Determine the block size if it is tall skinny or short and wide * IF((MB.LE.K).OR.(MB.GE.MAX(M,N,K))) THEN CALL ZGEMQRT( SIDE, TRANS, M, N, K, NB, A, LDA, - $ T, LDT, C, LDC, WORK, INFO) + $ T, LDT, C, LDC, WORK, INFO ) RETURN - END IF + END IF * IF(LEFT.AND.NOTRAN) THEN * @@ -410,7 +422,7 @@ * END IF * - WORK(1) = LW + WORK( 1 ) = LWMIN RETURN * * End of ZLAMTSQR diff --git a/lapack-netlib/SRC/zlarfgp.f b/lapack-netlib/SRC/zlarfgp.f index 6c9efb04c..d54f2ea5d 100644 --- a/lapack-netlib/SRC/zlarfgp.f +++ b/lapack-netlib/SRC/zlarfgp.f @@ -148,33 +148,23 @@ ALPHR = DBLE( ALPHA ) ALPHI = DIMAG( ALPHA ) * - IF( XNORM.LE.EPS*ABS(ALPHA) ) THEN + IF( XNORM.LE.EPS*ABS(ALPHA) .AND. ALPHI.EQ.ZERO ) THEN * * H = [1-alpha/abs(alpha) 0; 0 I], sign chosen so ALPHA >= 0. * - IF( ALPHI.EQ.ZERO ) THEN - IF( ALPHR.GE.ZERO ) THEN -* When TAU.eq.ZERO, the vector is special-cased to be -* all zeros in the application routines. We do not need -* to clear it. - TAU = ZERO - ELSE -* However, the application routines rely on explicit -* zero checks when TAU.ne.ZERO, and we must clear X. - TAU = TWO - DO J = 1, N-1 - X( 1 + (J-1)*INCX ) = ZERO - END DO - ALPHA = -ALPHA - END IF + IF( ALPHR.GE.ZERO ) THEN +* When TAU.eq.ZERO, the vector is special-cased to be +* all zeros in the application routines. We do not need +* to clear it. + TAU = ZERO ELSE -* Only "reflecting" the diagonal entry to be real and non-negative. - XNORM = DLAPY2( ALPHR, ALPHI ) - TAU = DCMPLX( ONE - ALPHR / XNORM, -ALPHI / XNORM ) +* However, the application routines rely on explicit +* zero checks when TAU.ne.ZERO, and we must clear X. + TAU = TWO DO J = 1, N-1 X( 1 + (J-1)*INCX ) = ZERO END DO - ALPHA = XNORM + ALPHA = -ALPHA END IF ELSE * diff --git a/lapack-netlib/SRC/zlaswlq.f b/lapack-netlib/SRC/zlaswlq.f index be4c48539..735207132 100644 --- a/lapack-netlib/SRC/zlaswlq.f +++ b/lapack-netlib/SRC/zlaswlq.f @@ -96,22 +96,23 @@ *> The leading dimension of the array T. LDT >= MB. *> \endverbatim *> -*> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) -*> +*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= MB*M. +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MB*M, otherwise. +*> *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. -*> *> \endverbatim +*> *> \param[out] INFO *> \verbatim *> INFO is INTEGER @@ -159,33 +160,37 @@ *> SIAM J. Sci. Comput, vol. 34, no. 
1, 2012 *> \endverbatim *> +*> \ingroup laswlq +*> * ===================================================================== SUBROUTINE ZLASWLQ( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK, - $ INFO) + $ INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- * * .. Scalar Arguments .. - INTEGER INFO, LDA, M, N, MB, NB, LWORK, LDT + INTEGER INFO, LDA, M, N, MB, NB, LWORK, LDT * .. * .. Array Arguments .. - COMPLEX*16 A( LDA, * ), WORK( * ), T( LDT, *) + COMPLEX*16 A( LDA, * ), WORK( * ), T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LQUERY - INTEGER I, II, KK, CTR + LOGICAL LQUERY + INTEGER I, II, KK, CTR, MINMN, LWMIN * .. * .. EXTERNAL FUNCTIONS .. LOGICAL LSAME EXTERNAL LSAME +* .. * .. EXTERNAL SUBROUTINES .. EXTERNAL ZGELQT, ZTPLQT, XERBLA +* .. * .. INTRINSIC FUNCTIONS .. INTRINSIC MAX, MIN, MOD * .. @@ -196,12 +201,19 @@ INFO = 0 * LQUERY = ( LWORK.EQ.-1 ) +* + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = M*MB + END IF * IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 .OR. N.LT.M ) THEN INFO = -2 - ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 )) THEN + ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 ) ) THEN INFO = -3 ELSE IF( NB.LE.0 ) THEN INFO = -4 @@ -209,60 +221,61 @@ INFO = -6 ELSE IF( LDT.LT.MB ) THEN INFO = -8 - ELSE IF( ( LWORK.LT.M*MB) .AND. (.NOT.LQUERY) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -10 END IF - IF( INFO.EQ.0) THEN - WORK(1) = MB*M +* + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = LWMIN END IF * IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZLASWLQ', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N).EQ.0 ) THEN - RETURN + IF( MINMN.EQ.0 ) THEN + RETURN END IF * * The LQ Decomposition * - IF((M.GE.N).OR.(NB.LE.M).OR.(NB.GE.N)) THEN - CALL ZGELQT( M, N, MB, A, LDA, T, LDT, WORK, INFO) + IF( (M.GE.N) .OR. (NB.LE.M) .OR. 
(NB.GE.N) ) THEN + CALL ZGELQT( M, N, MB, A, LDA, T, LDT, WORK, INFO ) RETURN - END IF + END IF * - KK = MOD((N-M),(NB-M)) - II=N-KK+1 + KK = MOD((N-M),(NB-M)) + II = N-KK+1 * -* Compute the LQ factorization of the first block A(1:M,1:NB) +* Compute the LQ factorization of the first block A(1:M,1:NB) * - CALL ZGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO) - CTR = 1 + CALL ZGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO ) + CTR = 1 * - DO I = NB+1, II-NB+M , (NB-M) + DO I = NB+1, II-NB+M, (NB-M) * -* Compute the QR factorization of the current block A(1:M,I:I+NB-M) +* Compute the QR factorization of the current block A(1:M,I:I+NB-M) * - CALL ZTPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ), - $ LDA, T(1, CTR * M + 1), - $ LDT, WORK, INFO ) - CTR = CTR + 1 - END DO + CALL ZTPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ), + $ LDA, T(1, CTR * M + 1), + $ LDT, WORK, INFO ) + CTR = CTR + 1 + END DO * * Compute the QR factorization of the last block A(1:M,II:N) * - IF (II.LE.N) THEN + IF( II.LE.N ) THEN CALL ZTPLQT( M, KK, 0, MB, A(1,1), LDA, A( 1, II ), - $ LDA, T(1, CTR * M + 1), LDT, - $ WORK, INFO ) - END IF + $ LDA, T(1, CTR * M + 1), LDT, + $ WORK, INFO ) + END IF * - WORK( 1 ) = M * MB + WORK( 1 ) = LWMIN RETURN * * End of ZLASWLQ diff --git a/lapack-netlib/SRC/zlatrs3.f b/lapack-netlib/SRC/zlatrs3.f index 231a17274..27eac839b 100644 --- a/lapack-netlib/SRC/zlatrs3.f +++ b/lapack-netlib/SRC/zlatrs3.f @@ -158,7 +158,11 @@ *> \endverbatim *> *> \param[in] LWORK +*> \verbatim *> LWORK is INTEGER +*> The dimension of the array WORK. +*> +*> If MIN(N,NRHS) = 0, LWORK >= 1, else *> LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where *> NBA = (N + NB - 1)/NB and NB is the optimal block size. *> @@ -166,6 +170,7 @@ *> only calculates the optimal dimensions of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. +*> \endverbatim *> *> \param[out] INFO *> \verbatim @@ -182,7 +187,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleOTHERauxiliary +*> \ingroup latrs3 *> \par Further Details: * ===================== * \verbatim @@ -257,7 +262,7 @@ LOGICAL LQUERY, NOTRAN, NOUNIT, UPPER INTEGER AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J, $ JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2, - $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS + $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS, LWMIN DOUBLE PRECISION ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC, $ SCAMIN, SMLNUM, TMAX * .. @@ -296,15 +301,24 @@ * row. WORK( I + KK * LDS ) is the scale factor of the vector * segment associated with the I-th block row and the KK-th vector * in the block column. +* LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) ) LDS = NBA +* * The second part stores upper bounds of the triangular A. There are * a total of NBA x NBA blocks, of which only the upper triangular * part or the lower triangular part is referenced. The upper bound of * the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). +* LANRM = NBA * NBA AWRK = LSCALE - WORK( 1 ) = LSCALE + LANRM +* + IF( MIN( N, NRHS ).EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = LSCALE + LANRM + END IF + WORK( 1 ) = LWMIN * * Test the input parameters. * @@ -326,7 +340,7 @@ INFO = -8 ELSE IF( LDX.LT.MAX( 1, N ) ) THEN INFO = -10 - ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN + ELSE IF( .NOT.LQUERY .AND. 
LWORK.LT.LWMIN ) THEN INFO = -14 END IF IF( INFO.NE.0 ) THEN diff --git a/lapack-netlib/SRC/zlatsqr.f b/lapack-netlib/SRC/zlatsqr.f index 8c938aebc..24d00f28a 100644 --- a/lapack-netlib/SRC/zlatsqr.f +++ b/lapack-netlib/SRC/zlatsqr.f @@ -101,15 +101,18 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= NB*N. +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= NB*N, otherwise. +*> *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> \endverbatim @@ -161,33 +164,37 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup latsqr +*> * ===================================================================== SUBROUTINE ZLATSQR( M, N, MB, NB, A, LDA, T, LDT, WORK, - $ LWORK, INFO) + $ LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- * * .. Scalar Arguments .. - INTEGER INFO, LDA, M, N, MB, NB, LDT, LWORK + INTEGER INFO, LDA, M, N, MB, NB, LDT, LWORK * .. * .. Array Arguments .. - COMPLEX*16 A( LDA, * ), WORK( * ), T(LDT, *) + COMPLEX*16 A( LDA, * ), WORK( * ), T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LQUERY - INTEGER I, II, KK, CTR + LOGICAL LQUERY + INTEGER I, II, KK, CTR, LWMIN, MINMN * .. * .. EXTERNAL FUNCTIONS .. LOGICAL LSAME EXTERNAL LSAME +* .. * .. EXTERNAL SUBROUTINES .. - EXTERNAL ZGEQRT, ZTPQRT, XERBLA + EXTERNAL ZGEQRT, ZTPQRT, XERBLA +* .. * .. INTRINSIC FUNCTIONS .. INTRINSIC MAX, MIN, MOD * .. @@ -198,6 +205,13 @@ INFO = 0 * LQUERY = ( LWORK.EQ.-1 ) +* + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = N*NB + END IF * IF( M.LT.0 ) THEN INFO = -1 @@ -205,64 +219,65 @@ INFO = -2 ELSE IF( MB.LT.1 ) THEN INFO = -3 - ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. N.GT.0 )) THEN + ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. N.GT.0 ) ) THEN INFO = -4 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -6 ELSE IF( LDT.LT.NB ) THEN INFO = -8 - ELSE IF( LWORK.LT.(N*NB) .AND. (.NOT.LQUERY) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -10 END IF - IF( INFO.EQ.0) THEN - WORK(1) = NB*N +* + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = LWMIN END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZLATSQR', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N).EQ.0 ) THEN - RETURN + IF( MINMN.EQ.0 ) THEN + RETURN END IF * * The QR Decomposition * - IF ((MB.LE.N).OR.(MB.GE.M)) THEN - CALL ZGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO) - RETURN - END IF - KK = MOD((M-N),(MB-N)) - II=M-KK+1 + IF( (MB.LE.N) .OR. 
(MB.GE.M) ) THEN + CALL ZGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO ) + RETURN + END IF + KK = MOD((M-N),(MB-N)) + II = M-KK+1 * -* Compute the QR factorization of the first block A(1:MB,1:N) +* Compute the QR factorization of the first block A(1:MB,1:N) * - CALL ZGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO ) - CTR = 1 + CALL ZGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO ) + CTR = 1 * - DO I = MB+1, II-MB+N , (MB-N) + DO I = MB+1, II-MB+N, (MB-N) * -* Compute the QR factorization of the current block A(I:I+MB-N,1:N) +* Compute the QR factorization of the current block A(I:I+MB-N,1:N) * - CALL ZTPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA, - $ T(1, CTR * N + 1), - $ LDT, WORK, INFO ) - CTR = CTR + 1 - END DO + CALL ZTPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA, + $ T(1, CTR * N + 1), + $ LDT, WORK, INFO ) + CTR = CTR + 1 + END DO * -* Compute the QR factorization of the last block A(II:M,1:N) +* Compute the QR factorization of the last block A(II:M,1:N) * - IF (II.LE.M) THEN - CALL ZTPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA, - $ T(1,CTR * N + 1), LDT, - $ WORK, INFO ) - END IF + IF( II.LE.M ) THEN + CALL ZTPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA, + $ T(1,CTR * N + 1), LDT, + $ WORK, INFO ) + END IF * - work( 1 ) = N*NB + WORK( 1 ) = LWMIN RETURN * * End of ZLATSQR diff --git a/lapack-netlib/TESTING/CMakeLists.txt b/lapack-netlib/TESTING/CMakeLists.txt index b4e2223f7..d4e6f970d 100644 --- a/lapack-netlib/TESTING/CMakeLists.txt +++ b/lapack-netlib/TESTING/CMakeLists.txt @@ -54,6 +54,9 @@ add_lapack_test(sgqr.out gqr.in xeigtsts) add_lapack_test(sgsv.out gsv.in xeigtsts) add_lapack_test(scsd.out csd.in xeigtsts) add_lapack_test(slse.out lse.in xeigtsts) +# +# ======== SINGLE DMD EIG TESTS =========================== +add_lapack_test(sdmd.out sdmd.in xdmdeigtsts) endif() if(BUILD_DOUBLE) @@ -85,6 +88,9 @@ add_lapack_test(dgqr.out gqr.in xeigtstd) add_lapack_test(dgsv.out gsv.in xeigtstd) add_lapack_test(dcsd.out csd.in xeigtstd) add_lapack_test(dlse.out lse.in xeigtstd) +# +# ======== DOUBLE DMD EIG TESTS =========================== +add_lapack_test(ddmd.out ddmd.in xdmdeigtstd) endif() if(BUILD_COMPLEX) @@ -114,6 +120,9 @@ add_lapack_test(cgqr.out gqr.in xeigtstc) add_lapack_test(cgsv.out gsv.in xeigtstc) add_lapack_test(ccsd.out csd.in xeigtstc) add_lapack_test(clse.out lse.in xeigtstc) +# +# ======== COMPLEX DMD EIG TESTS =========================== +add_lapack_test(cdmd.out cdmd.in xdmdeigtstc) endif() if(BUILD_COMPLEX16) @@ -145,6 +154,9 @@ add_lapack_test(zgqr.out gqr.in xeigtstz) add_lapack_test(zgsv.out gsv.in xeigtstz) add_lapack_test(zcsd.out csd.in xeigtstz) add_lapack_test(zlse.out lse.in xeigtstz) +# +# ======== COMPLEX16 DMD EIG TESTS =========================== +add_lapack_test(zdmd.out zdmd.in xdmdeigtstz) endif() diff --git a/lapack-netlib/TESTING/EIG/CMakeLists.txt b/lapack-netlib/TESTING/EIG/CMakeLists.txt index d252c7fa9..e7236677a 100644 --- a/lapack-netlib/TESTING/EIG/CMakeLists.txt +++ b/lapack-netlib/TESTING/EIG/CMakeLists.txt @@ -42,6 +42,8 @@ set(SEIGTST schkee.F sort03.f ssbt21.f ssgt01.f sslect.f sspt21.f sstt21.f sstt22.f ssyl01.f ssyt21.f ssyt22.f) +set(SDMDEIGTST schkdmd.f90) + set(CEIGTST cchkee.F cbdt01.f cbdt02.f cbdt03.f cbdt05.f cchkbb.f cchkbd.f cchkbk.f cchkbl.f cchkec.f @@ -59,6 +61,8 @@ set(CEIGTST cchkee.F csgt01.f cslect.f csyl01.f cstt21.f cstt22.f cunt01.f cunt03.f) +set(CDMDEIGTST cchkdmd.f90) + set(DZIGTST dlafts.f dlahd2.f dlasum.f dlatb9.f dstech.f dstect.f dsvdch.f dsvdct.f dsxt1.f) @@ -79,6 +83,8 @@ 
set(DEIGTST dchkee.F dort03.f dsbt21.f dsgt01.f dslect.f dspt21.f dstt21.f dstt22.f dsyl01.f dsyt21.f dsyt22.f) +set(DDMDEIGTST dchkdmd.f90) + set(ZEIGTST zchkee.F zbdt01.f zbdt02.f zbdt03.f zbdt05.f zchkbb.f zchkbd.f zchkbk.f zchkbl.f zchkec.f @@ -96,6 +102,8 @@ set(ZEIGTST zchkee.F zsgt01.f zslect.f zsyl01.f zstt21.f zstt22.f zunt01.f zunt03.f) +set(ZDMDEIGTST zchkdmd.f90) + macro(add_eig_executable name) add_executable(${name} ${ARGN}) target_link_libraries(${name} openblas${SUFFIX64_UNDERSCORE}) @@ -104,16 +112,20 @@ endmacro() if(BUILD_SINGLE) add_eig_executable(xeigtsts ${SEIGTST} ${SCIGTST} ${AEIGTST}) +add_eig_executable(xdmdeigtsts ${SDMDEIGTST}) endif() if(BUILD_COMPLEX) add_eig_executable(xeigtstc ${CEIGTST} ${SCIGTST} ${AEIGTST}) +add_eig_executable(xdmdeigtstc ${CDMDEIGTST}) endif() if(BUILD_DOUBLE) add_eig_executable(xeigtstd ${DEIGTST} ${DZIGTST} ${AEIGTST}) +add_eig_executable(xdmdeigtstd ${DDMDEIGTST}) endif() if(BUILD_COMPLEX16) add_eig_executable(xeigtstz ${ZEIGTST} ${DZIGTST} ${AEIGTST}) +add_eig_executable(xdmdeigtstz ${ZDMDEIGTST}) endif() diff --git a/lapack-netlib/TESTING/EIG/Makefile b/lapack-netlib/TESTING/EIG/Makefile index 4e7cf4629..9cf0fc95e 100644 --- a/lapack-netlib/TESTING/EIG/Makefile +++ b/lapack-netlib/TESTING/EIG/Makefile @@ -136,28 +136,28 @@ double: xeigtstd xdmdeigtstd complex16: xeigtstz xdmdeigtstz xdmdeigtsts: $(SDMDEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) - $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ xdmdeigtstc: $(CDMDEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) - $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ xdmdeigtstd: $(DDMDEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) - $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ xdmdeigtstz: $(ZDMDEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) - $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ xeigtsts: $(SEIGTST) $(SCIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) - $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ xeigtstc: $(CEIGTST) $(SCIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) - $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ xeigtstd: $(DEIGTST) $(DZIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) - $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ xeigtstz: $(ZEIGTST) $(DZIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) - $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ $(SDMDEIGTST): $(FRC) $(CDMDEIGTST): $(FRC) diff --git a/lapack-netlib/TESTING/EIG/cerrst.f b/lapack-netlib/TESTING/EIG/cerrst.f index 1748a2aad..d23eb14ea 100644 --- a/lapack-netlib/TESTING/EIG/cerrst.f +++ b/lapack-netlib/TESTING/EIG/cerrst.f @@ -748,17 +748,17 @@ CALL CHKXER( 'CHEEVR', INFOT, NOUT, LERR, OK ) INFOT = 18 CALL CHEEVR( 'V', 'I', 'U', 1, A, 1, 0.0E0, 0.0E0, 1, 1, 0.0, - $ M, R, Z, 1, IW, Q, 2*N-1, RW, 24*N, IW( 2*N+1 ), + $ M, R, Z, 1, IW, Q, 0, RW, 24*N, IW( 2*N+1 ), $ 10*N, INFO ) CALL CHKXER( 'CHEEVR', INFOT, NOUT, LERR, OK ) INFOT = 20 CALL CHEEVR( 'V', 'I', 'U', 1, A, 1, 0.0E0, 0.0E0, 1, 1, 0.0, - $ M, R, Z, 1, IW, Q, 2*N, RW, 24*N-1, IW( 2*N-1 ), + $ M, R, Z, 1, IW, Q, 2*N, RW, 0, IW( 2*N-1 ), $ 10*N, INFO ) CALL CHKXER( 'CHEEVR', INFOT, NOUT, LERR, OK ) INFOT = 22 CALL CHEEVR( 'V', 'I', 'U', 1, A, 1, 0.0E0, 0.0E0, 1, 1, 0.0, - $ M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW, 10*N-1, + $ M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW, 0, $ INFO ) CALL CHKXER( 
'CHEEVR', INFOT, NOUT, LERR, OK ) NT = NT + 12 @@ -830,19 +830,19 @@ INFOT = 18 CALL CHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1, $ 0.0, 0.0, 1, 1, 0.0, - $ M, R, Z, 1, IW, Q, 2*N-1, RW, 24*N, IW( 2*N+1 ), + $ M, R, Z, 1, IW, Q, 0, RW, 24*N, IW( 2*N+1 ), $ 10*N, INFO ) CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 20 CALL CHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1, $ 0.0, 0.0, 1, 1, 0.0, - $ M, R, Z, 1, IW, Q, 26*N, RW, 24*N-1, IW( 2*N-1 ), + $ M, R, Z, 1, IW, Q, 26*N, RW, 0, IW( 2*N-1 ), $ 10*N, INFO ) CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 22 CALL CHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1, $ 0.0, 0.0, 1, 1, 0.0, - $ M, R, Z, 1, IW, Q, 26*N, RW, 24*N, IW, 10*N-1, + $ M, R, Z, 1, IW, Q, 26*N, RW, 24*N, IW, 0, $ INFO ) CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) NT = NT + 13 diff --git a/lapack-netlib/TESTING/EIG/chkxer.f b/lapack-netlib/TESTING/EIG/chkxer.f index fd00bb65a..70caf7e0a 100644 --- a/lapack-netlib/TESTING/EIG/chkxer.f +++ b/lapack-netlib/TESTING/EIG/chkxer.f @@ -61,7 +61,7 @@ RETURN * 9999 FORMAT( ' *** Illegal value of parameter number ', I2, - $ ' not detected by ', A6, ' ***' ) + $ ' not detected by ', A, ' ***' ) * * End of CHKXER * diff --git a/lapack-netlib/TESTING/EIG/derrst.f b/lapack-netlib/TESTING/EIG/derrst.f index 059538644..7d111e2e0 100644 --- a/lapack-netlib/TESTING/EIG/derrst.f +++ b/lapack-netlib/TESTING/EIG/derrst.f @@ -735,12 +735,12 @@ CALL CHKXER( 'DSYEVR', INFOT, NOUT, LERR, OK ) INFOT = 18 CALL DSYEVR( 'V', 'I', 'U', 1, A, 1, 0.0D0, 0.0D0, 1, 1, 0.0D0, - $ M, R, Z, 1, IW, Q, 26*N-1, IW( 2*N+1 ), 10*N, + $ M, R, Z, 1, IW, Q, 0, IW( 2*N+1 ), 10*N, $ INFO ) CALL CHKXER( 'DSYEVR', INFOT, NOUT, LERR, OK ) INFOT = 20 CALL DSYEVR( 'V', 'I', 'U', 1, A, 1, 0.0D0, 0.0D0, 1, 1, 0.0D0, - $ M, R, Z, 1, IW, Q, 26*N, IW( 2*N+1 ), 10*N-1, + $ M, R, Z, 1, IW, Q, 26*N, IW( 2*N+1 ), 0, $ INFO ) CALL CHKXER( 'DSYEVR', INFOT, NOUT, LERR, OK ) NT = NT + 11 diff --git a/lapack-netlib/TESTING/EIG/serrst.f b/lapack-netlib/TESTING/EIG/serrst.f index b87fc42ef..408346382 100644 --- a/lapack-netlib/TESTING/EIG/serrst.f +++ b/lapack-netlib/TESTING/EIG/serrst.f @@ -733,12 +733,12 @@ CALL CHKXER( 'SSYEVR', INFOT, NOUT, LERR, OK ) INFOT = 18 CALL SSYEVR( 'V', 'I', 'U', 1, A, 1, 0.0E0, 0.0E0, 1, 1, 0.0, - $ M, R, Z, 1, IW, Q, 26*N-1, IW( 2*N+1 ), 10*N, + $ M, R, Z, 1, IW, Q, 0, IW( 2*N+1 ), 10*N, $ INFO ) CALL CHKXER( 'SSYEVR', INFOT, NOUT, LERR, OK ) INFOT = 20 CALL SSYEVR( 'V', 'I', 'U', 1, A, 1, 0.0E0, 0.0E0, 1, 1, 0.0, - $ M, R, Z, 1, IW, Q, 26*N, IW( 2*N+1 ), 10*N-1, + $ M, R, Z, 1, IW, Q, 26*N, IW( 2*N+1 ), 0, $ INFO ) CALL CHKXER( 'SSYEVR', INFOT, NOUT, LERR, OK ) NT = NT + 11 diff --git a/lapack-netlib/TESTING/EIG/zerrst.f b/lapack-netlib/TESTING/EIG/zerrst.f index d7b41c053..31881c4de 100644 --- a/lapack-netlib/TESTING/EIG/zerrst.f +++ b/lapack-netlib/TESTING/EIG/zerrst.f @@ -748,17 +748,17 @@ CALL CHKXER( 'ZHEEVR', INFOT, NOUT, LERR, OK ) INFOT = 18 CALL ZHEEVR( 'V', 'I', 'U', 1, A, 1, 0.0D0, 0.0D0, 1, 1, 0.0D0, - $ M, R, Z, 1, IW, Q, 2*N-1, RW, 24*N, IW( 2*N+1 ), + $ M, R, Z, 1, IW, Q, 0, RW, 24*N, IW( 2*N+1 ), $ 10*N, INFO ) CALL CHKXER( 'ZHEEVR', INFOT, NOUT, LERR, OK ) INFOT = 20 CALL ZHEEVR( 'V', 'I', 'U', 1, A, 1, 0.0D0, 0.0D0, 1, 1, 0.0D0, - $ M, R, Z, 1, IW, Q, 2*N, RW, 24*N-1, IW( 2*N-1 ), + $ M, R, Z, 1, IW, Q, 2*N, RW, 0, IW( 2*N-1 ), $ 10*N, INFO ) CALL CHKXER( 'ZHEEVR', INFOT, NOUT, LERR, OK ) INFOT = 22 CALL ZHEEVR( 'V', 'I', 'U', 1, A, 1, 0.0D0, 0.0D0, 1, 1, 0.0D0, - $ M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW, 10*N-1, + $ M, R, Z, 
1, IW, Q, 2*N, RW, 24*N, IW, 0, $ INFO ) CALL CHKXER( 'ZHEEVR', INFOT, NOUT, LERR, OK ) NT = NT + 12 @@ -830,19 +830,19 @@ INFOT = 18 CALL ZHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1, $ 0.0D0, 0.0D0, 1, 1, 0.0D0, - $ M, R, Z, 1, IW, Q, 2*N-1, RW, 24*N, IW( 2*N+1 ), + $ M, R, Z, 1, IW, Q, 0, RW, 24*N, IW( 2*N+1 ), $ 10*N, INFO ) CALL CHKXER( 'ZHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 20 CALL ZHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1, $ 0.0D0, 0.0D0, 1, 1, 0.0D0, - $ M, R, Z, 1, IW, Q, 26*N, RW, 24*N-1, IW( 2*N-1 ), + $ M, R, Z, 1, IW, Q, 26*N, RW, 0, IW( 2*N-1 ), $ 10*N, INFO ) CALL CHKXER( 'ZHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 22 CALL ZHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1, $ 0.0D0, 0.0D0, 1, 1, 0.0D0, - $ M, R, Z, 1, IW, Q, 26*N, RW, 24*N, IW, 10*N-1, + $ M, R, Z, 1, IW, Q, 26*N, RW, 24*N, IW, 0, $ INFO ) CALL CHKXER( 'ZHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) NT = NT + 13 diff --git a/lapack-netlib/TESTING/LIN/cchkhe_aa_2stage.f b/lapack-netlib/TESTING/LIN/cchkhe_aa_2stage.f index 30a61261f..d79978e55 100644 --- a/lapack-netlib/TESTING/LIN/cchkhe_aa_2stage.f +++ b/lapack-netlib/TESTING/LIN/cchkhe_aa_2stage.f @@ -433,9 +433,9 @@ * block factorization, LWORK is the length of AINV. * SRNAMT = 'CHETRF_AA_2STAGE' - LWORK = MIN(N*NB, 3*NMAX*NMAX) - CALL CHETRF_AA_2STAGE( UPLO, N, AFAC, LDA, - $ AINV, (3*NB+1)*N, + LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX) + CALL CHETRF_AA_2STAGE( UPLO, N, AFAC, LDA, + $ AINV, MAX( 1, (3*NB+1)*N ), $ IWORK, IWORK( 1+N ), $ WORK, LWORK, $ INFO ) @@ -517,7 +517,6 @@ c NT = 1 CALL CLACPY( 'Full', N, NRHS, B, LDA, X, LDA ) * SRNAMT = 'CHETRS_AA_2STAGE' - LWORK = MAX( 1, 3*N-2 ) CALL CHETRS_AA_2STAGE( UPLO, N, NRHS, AFAC, LDA, $ AINV, (3*NB+1)*N, IWORK, IWORK( 1+N ), $ X, LDA, INFO ) diff --git a/lapack-netlib/TESTING/LIN/cdrvhe_aa_2stage.f b/lapack-netlib/TESTING/LIN/cdrvhe_aa_2stage.f index 51cef512d..83e8a17b0 100644 --- a/lapack-netlib/TESTING/LIN/cdrvhe_aa_2stage.f +++ b/lapack-netlib/TESTING/LIN/cdrvhe_aa_2stage.f @@ -400,9 +400,9 @@ * Factor the matrix and solve the system using CHESV_AA. * SRNAMT = 'CHESV_AA_2STAGE ' - LWORK = MIN(N*NB, 3*NMAX*NMAX) + LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX) CALL CHESV_AA_2STAGE( UPLO, N, NRHS, AFAC, LDA, - $ AINV, (3*NB+1)*N, + $ AINV, MAX( 1, (3*NB+1)*N ), $ IWORK, IWORK( 1+N ), $ X, LDA, WORK, LWORK, INFO ) * diff --git a/lapack-netlib/TESTING/LIN/dchksy_aa_2stage.f b/lapack-netlib/TESTING/LIN/dchksy_aa_2stage.f index bc4e77a5a..1940351a4 100644 --- a/lapack-netlib/TESTING/LIN/dchksy_aa_2stage.f +++ b/lapack-netlib/TESTING/LIN/dchksy_aa_2stage.f @@ -421,9 +421,9 @@ * block factorization, LWORK is the length of AINV. * SRNAMT = 'DSYTRF_AA_2STAGE' - LWORK = MIN(N*NB, 3*NMAX*NMAX) + LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX ) CALL DSYTRF_AA_2STAGE( UPLO, N, AFAC, LDA, - $ AINV, (3*NB+1)*N, + $ AINV, MAX( 1, (3*NB+1)*N ), $ IWORK, IWORK( 1+N ), $ WORK, LWORK, $ INFO ) @@ -503,7 +503,6 @@ c NT = 1 CALL DLACPY( 'Full', N, NRHS, B, LDA, X, LDA ) * SRNAMT = 'DSYTRS_AA_2STAGE' - LWORK = MAX( 1, 3*N-2 ) CALL DSYTRS_AA_2STAGE( UPLO, N, NRHS, AFAC, LDA, $ AINV, (3*NB+1)*N, IWORK, IWORK( 1+N ), $ X, LDA, INFO ) diff --git a/lapack-netlib/TESTING/LIN/ddrvsy_aa_2stage.f b/lapack-netlib/TESTING/LIN/ddrvsy_aa_2stage.f index 91c9e8e9a..d04106ae3 100644 --- a/lapack-netlib/TESTING/LIN/ddrvsy_aa_2stage.f +++ b/lapack-netlib/TESTING/LIN/ddrvsy_aa_2stage.f @@ -400,9 +400,9 @@ * Factor the matrix and solve the system using DSYSV_AA. 
* SRNAMT = 'DSYSV_AA_2STAGE ' - LWORK = MIN(N*NB, 3*NMAX*NMAX) + LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX ) CALL DSYSV_AA_2STAGE( UPLO, N, NRHS, AFAC, LDA, - $ AINV, (3*NB+1)*N, + $ AINV, MAX( 1, (3*NB+1)*N ), $ IWORK, IWORK( 1+N ), $ X, LDA, WORK, LWORK, INFO ) * diff --git a/lapack-netlib/TESTING/LIN/schksy_aa_2stage.f b/lapack-netlib/TESTING/LIN/schksy_aa_2stage.f index d3c27ae56..6490cd7c3 100644 --- a/lapack-netlib/TESTING/LIN/schksy_aa_2stage.f +++ b/lapack-netlib/TESTING/LIN/schksy_aa_2stage.f @@ -423,9 +423,9 @@ * block factorization, LWORK is the length of AINV. * SRNAMT = 'SSYTRF_AA_2STAGE' - LWORK = MIN(N*NB, 3*NMAX*NMAX) + LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX ) CALL SSYTRF_AA_2STAGE( UPLO, N, AFAC, LDA, - $ AINV, (3*NB+1)*N, + $ AINV, MAX( 1, (3*NB+1)*N ), $ IWORK, IWORK( 1+N ), $ WORK, LWORK, $ INFO ) @@ -505,7 +505,6 @@ CALL SLACPY( 'Full', N, NRHS, B, LDA, X, LDA ) * SRNAMT = 'SSYTRS_AA_2STAGE' - LWORK = MAX( 1, 3*N-2 ) CALL SSYTRS_AA_2STAGE( UPLO, N, NRHS, AFAC, LDA, $ AINV, (3*NB+1)*N, IWORK, IWORK( 1+N ), $ X, LDA, INFO ) diff --git a/lapack-netlib/TESTING/LIN/sdrvsy_aa_2stage.f b/lapack-netlib/TESTING/LIN/sdrvsy_aa_2stage.f index aff32bce9..319b90805 100644 --- a/lapack-netlib/TESTING/LIN/sdrvsy_aa_2stage.f +++ b/lapack-netlib/TESTING/LIN/sdrvsy_aa_2stage.f @@ -400,9 +400,9 @@ * Factor the matrix and solve the system using SSYSV_AA. * SRNAMT = 'SSYSV_AA_2STAGE ' - LWORK = MIN(N*NB, 3*NMAX*NMAX) + LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX ) CALL SSYSV_AA_2STAGE( UPLO, N, NRHS, AFAC, LDA, - $ AINV, (3*NB+1)*N, + $ AINV, MAX( 1, (3*NB+1)*N ), $ IWORK, IWORK( 1+N ), $ X, LDA, WORK, LWORK, INFO ) * diff --git a/lapack-netlib/TESTING/LIN/zchkhe_aa_2stage.f b/lapack-netlib/TESTING/LIN/zchkhe_aa_2stage.f index 381fac9f2..51082f1d0 100644 --- a/lapack-netlib/TESTING/LIN/zchkhe_aa_2stage.f +++ b/lapack-netlib/TESTING/LIN/zchkhe_aa_2stage.f @@ -8,7 +8,7 @@ * Definition: * =========== * -* SUBROUTINE ZCHKHE_AA_2STAGE( DOTYPE, NN, NVAL, NNB, NBVAL, +* SUBROUTINE ZCHKHE_AA_2STAGE( DOTYPE, NN, NVAL, NNB, NBVAL, * NNS, NSVAL, THRESH, TSTERR, NMAX, A, * AFAC, AINV, B, X, XACT, WORK, RWORK, * IWORK, NOUT ) @@ -185,7 +185,8 @@ LOGICAL DOTYPE( * ) INTEGER IWORK( * ), NBVAL( * ), NSVAL( * ), NVAL( * ) COMPLEX*16 A( * ), AFAC( * ), AINV( * ), B( * ), - $ RWORK( * ), WORK( * ), X( * ), XACT( * ) + $ WORK( * ), X( * ), XACT( * ) + DOUBLE PRECISION RWORK( * ) * .. * * ===================================================================== @@ -430,9 +431,9 @@ * block factorization, LWORK is the length of AINV. * SRNAMT = 'ZHETRF_AA_2STAGE' - LWORK = MIN(N*NB, 3*NMAX*NMAX) - CALL ZHETRF_AA_2STAGE( UPLO, N, AFAC, LDA, - $ AINV, (3*NB+1)*N, + LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX ) + CALL ZHETRF_AA_2STAGE( UPLO, N, AFAC, LDA, + $ AINV, MAX( 1, (3*NB+1)*N ), $ IWORK, IWORK( 1+N ), $ WORK, LWORK, $ INFO ) diff --git a/lapack-netlib/TESTING/LIN/zdrvhe_aa_2stage.f b/lapack-netlib/TESTING/LIN/zdrvhe_aa_2stage.f index 9401867e0..fcd774491 100644 --- a/lapack-netlib/TESTING/LIN/zdrvhe_aa_2stage.f +++ b/lapack-netlib/TESTING/LIN/zdrvhe_aa_2stage.f @@ -400,9 +400,9 @@ * Factor the matrix and solve the system using ZHESV_AA. 
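Illustration (not part of the patch): the *_aa_2stage test hunks around here all replace LWORK = MIN(N*NB, 3*NMAX*NMAX) with LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX ) and pass MAX( 1, (3*NB+1)*N ) for the band workspace size (LTB), so that a zero-sized test case still hands the factorization a legal, at-least-one-element buffer. The small C helper below mirrors only that clamping rule; the function name and the sample bound are made up for the sketch.

    #include <stdio.h>

    /* Hypothetical helper: clamp the requested N*NB workspace to [1, capacity]. */
    static int clamped_lwork(int n, int nb, int capacity) {
        long want = (long)n * nb;       /* requested N*NB workspace */
        if (want < 1) want = 1;         /* the routines now require LWORK >= 1 */
        if (want > capacity) want = capacity;
        return (int)want;
    }

    int main(void) {
        int nmax = 132;                 /* illustrative test-suite bound */
        int capacity = 3 * nmax * nmax;
        printf("%d %d %d\n",
               clamped_lwork(0, 64, capacity),      /* -> 1 (empty problem)   */
               clamped_lwork(50, 64, capacity),     /* -> 3200                */
               clamped_lwork(5000, 64, capacity));  /* -> capacity            */
        return 0;
    }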
* SRNAMT = 'ZHESV_AA_2STAGE ' - LWORK = MIN(N*NB, 3*NMAX*NMAX) + LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX ) CALL ZHESV_AA_2STAGE( UPLO, N, NRHS, AFAC, LDA, - $ AINV, (3*NB+1)*N, + $ AINV, MAX( 1, (3*NB+1)*N ), $ IWORK, IWORK( 1+N ), $ X, LDA, WORK, LWORK, INFO ) * diff --git a/lapack-netlib/TESTING/dgd.in b/lapack-netlib/TESTING/dgd.in index ca24dd2a0..127b63789 100644 --- a/lapack-netlib/TESTING/dgd.in +++ b/lapack-netlib/TESTING/dgd.in @@ -1,6 +1,6 @@ DGS Data for the Real Nonsymmetric Schur Form Driver 5 Number of matrix dimensions -2 6 10 12 20 30 Matrix dimensions +2 10 12 20 30 Matrix dimensions 1 1 1 2 1 Parameters NB, NBMIN, NXOVER, NS, NBCOL 10 Threshold for test ratios .TRUE. Put T to test the error exits diff --git a/lapack-netlib/TESTING/sgd.in b/lapack-netlib/TESTING/sgd.in index 164b3cc35..efab6c257 100644 --- a/lapack-netlib/TESTING/sgd.in +++ b/lapack-netlib/TESTING/sgd.in @@ -1,6 +1,6 @@ SGS Data for the Real Nonsymmetric Schur Form Driver 5 Number of matrix dimensions -2 6 10 12 20 30 Matrix dimensions +2 10 12 20 30 Matrix dimensions 1 1 1 2 1 Parameters NB, NBMIN, NXOVER, NS, NBCOL 10 Threshold for test ratios .TRUE. Put T to test the error exits diff --git a/lapack-netlib/lapack_testing.py b/lapack-netlib/lapack_testing.py index 96fbeb2a6..fea0ff765 100755 --- a/lapack-netlib/lapack_testing.py +++ b/lapack-netlib/lapack_testing.py @@ -255,19 +255,19 @@ for dtype in range_prec: else: if dtest==16: # LIN TESTS - cmdbase="xlintst"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" + cmdbase="LIN/xlintst"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" elif dtest==17: # PROTO LIN TESTS - cmdbase="xlintst"+letter+dtypes[0][dtype-1]+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" + cmdbase="LIN/xlintst"+letter+dtypes[0][dtype-1]+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" elif dtest==18: # PROTO LIN TESTS - cmdbase="xlintstrf"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" + cmdbase="LIN/xlintstrf"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" elif dtest==20: # DMD EIG TESTS - cmdbase="xdmdeigtst"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" + cmdbase="EIG/xdmdeigtst"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" else: # EIG TESTS - cmdbase="xeigtst"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" + cmdbase="EIG/xeigtst"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" if not just_errors and not short_summary: print("Testing "+name+" "+dtests[1][dtest]+"-"+cmdbase, end=' ') # Run the process: either to read the file or run the LAPACK testing diff --git a/lapack/laswp/csky/Makefile b/lapack/laswp/csky/Makefile new file mode 100644 index 000000000..75411deb5 --- /dev/null +++ b/lapack/laswp/csky/Makefile @@ -0,0 +1,13 @@ +TOPDIR = ../../.. +include ../../../Makefile.system + +ifndef LASWP +LASWP = ../generic/laswp_k.c +endif + +ifndef ZLASWP +ZLASWP = ../generic/zlaswp_k.c +endif + +include ../generic/Makefile + diff --git a/param.h b/param.h index ee4640f57..5d2e960a2 100644 --- a/param.h +++ b/param.h @@ -2743,19 +2743,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL -#ifdef HAVE_MSA -#define SGEMM_DEFAULT_UNROLL_M 8 -#define SGEMM_DEFAULT_UNROLL_N 8 - -#define DGEMM_DEFAULT_UNROLL_M 8 -#define DGEMM_DEFAULT_UNROLL_N 4 - -#define CGEMM_DEFAULT_UNROLL_M 8 -#define CGEMM_DEFAULT_UNROLL_N 4 - -#define ZGEMM_DEFAULT_UNROLL_M 4 -#define ZGEMM_DEFAULT_UNROLL_N 4 -#else +#if defined(NO_MSA) #define SGEMM_DEFAULT_UNROLL_M 8 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2767,6 +2755,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 2 +#else +#define SGEMM_DEFAULT_UNROLL_M 8 +#define SGEMM_DEFAULT_UNROLL_N 8 + +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 + +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 #endif #define SGEMM_DEFAULT_P 64 @@ -2853,13 +2853,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #define QGEMM_DEFAULT_UNROLL_N 2 -#define CGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_N 4 #define XGEMM_DEFAULT_UNROLL_N 1 #define QGEMM_DEFAULT_UNROLL_M 2 -#define CGEMM_DEFAULT_UNROLL_M 1 -#define ZGEMM_DEFAULT_UNROLL_M 1 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 8 #define XGEMM_DEFAULT_UNROLL_M 1 #define SGEMM_DEFAULT_P 256 @@ -2888,13 +2888,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_UNROLL_M 2 #define SGEMM_DEFAULT_UNROLL_N 8 -#define DGEMM_DEFAULT_UNROLL_M 2 -#define DGEMM_DEFAULT_UNROLL_N 8 +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 -#define CGEMM_DEFAULT_UNROLL_M 1 +#define CGEMM_DEFAULT_UNROLL_M 8 #define CGEMM_DEFAULT_UNROLL_N 4 -#define ZGEMM_DEFAULT_UNROLL_M 1 +#define ZGEMM_DEFAULT_UNROLL_M 4 #define ZGEMM_DEFAULT_UNROLL_N 4 #define SGEMM_DEFAULT_P 128 @@ -2958,19 +2958,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN (BLASLONG) 0x03fffUL -#if defined(HAVE_MSA) -#define SGEMM_DEFAULT_UNROLL_M 8 -#define SGEMM_DEFAULT_UNROLL_N 8 - -#define DGEMM_DEFAULT_UNROLL_M 8 -#define DGEMM_DEFAULT_UNROLL_N 4 - -#define CGEMM_DEFAULT_UNROLL_M 8 -#define CGEMM_DEFAULT_UNROLL_N 4 - -#define ZGEMM_DEFAULT_UNROLL_M 4 -#define ZGEMM_DEFAULT_UNROLL_N 4 -#else +#if defined(NO_MSA) #define SGEMM_DEFAULT_UNROLL_M 2 #define SGEMM_DEFAULT_UNROLL_N 2 @@ -2982,6 +2970,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 2 +#else +#define SGEMM_DEFAULT_UNROLL_M 8 +#define SGEMM_DEFAULT_UNROLL_N 8 + +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 + +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 #endif #define SGEMM_DEFAULT_P 128 @@ -3041,6 +3041,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif +#if defined(x280) +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 16 // 4 // 16 // 2 +#define SGEMM_DEFAULT_UNROLL_N 8// 4 // 4 // 2 + +/* SGEMM_UNROLL_MN is calculated as max(SGEMM_UNROLL_M, SGEMM_UNROLL_N) + * Since we don't define SGEMM_UNROLL_M correctly we have to manually set this macro. 
+ * If VLMAX size is ever more than 1024, this should be increased also. */ +#define SGEMM_DEFAULT_UNROLL_MN 32 + +#define DGEMM_DEFAULT_UNROLL_M 16 //2 // 8 +#define DGEMM_DEFAULT_UNROLL_N 8 //2 // 4 +#define DGEMM_DEFAULT_UNROLL_MN 32 + +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_MN 32 + +#define ZGEMM_DEFAULT_UNROLL_M 8 +#define ZGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_MN 16 + +#define SGEMM_DEFAULT_P 160 +#define DGEMM_DEFAULT_P 160 +#define CGEMM_DEFAULT_P 96 +#define ZGEMM_DEFAULT_P 64 + +#define SGEMM_DEFAULT_Q 240 +#define DGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 120 +#define ZGEMM_DEFAULT_Q 120 + +#define SGEMM_DEFAULT_R 12288 +#define DGEMM_DEFAULT_R 8192 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + +#define SYMV_P 16 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 + +#endif #ifdef C910V #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 @@ -3080,6 +3126,84 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif +#ifdef RISCV64_ZVL128B +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 8 +#define SGEMM_DEFAULT_UNROLL_N 8 + +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 + +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_P 96 +#define ZGEMM_DEFAULT_P 64 + +#define SGEMM_DEFAULT_Q 240 +#define DGEMM_DEFAULT_Q 120 +#define CGEMM_DEFAULT_Q 120 +#define ZGEMM_DEFAULT_Q 120 + +#define SGEMM_DEFAULT_R 12288 +#define DGEMM_DEFAULT_R 8192 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + +#define SYMV_P 16 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 + +#endif + +#ifdef RISCV64_ZVL256B +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define SGEMM_DEFAULT_UNROLL_N 8 + +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 8 + +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 8 + +#define ZGEMM_DEFAULT_UNROLL_M 8 +#define ZGEMM_DEFAULT_UNROLL_N 4 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 64 +#define CGEMM_DEFAULT_P 64 +#define ZGEMM_DEFAULT_P 64 + +#define SGEMM_DEFAULT_Q 128 +#define DGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 128 +#define ZGEMM_DEFAULT_Q 64 + +#define SGEMM_DEFAULT_R 16384 +#define DGEMM_DEFAULT_R 8192 +#define CGEMM_DEFAULT_R 8192 +#define ZGEMM_DEFAULT_R 4096 + +#define SYMV_P 16 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 + +#endif + #ifdef ARMV7 #define SNUMOPT 2 #define DNUMOPT 2 @@ -3359,13 +3483,13 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define ZGEMM_DEFAULT_UNROLL_M 4 #define ZGEMM_DEFAULT_UNROLL_N 4 -#define SGEMM_DEFAULT_P 128 -#define DGEMM_DEFAULT_P 160 +#define SGEMM_DEFAULT_P 240 +#define DGEMM_DEFAULT_P 240 #define CGEMM_DEFAULT_P 128 #define ZGEMM_DEFAULT_P 128 -#define SGEMM_DEFAULT_Q 352 -#define DGEMM_DEFAULT_Q 128 +#define SGEMM_DEFAULT_Q 640 +#define DGEMM_DEFAULT_Q 320 #define CGEMM_DEFAULT_Q 224 #define ZGEMM_DEFAULT_Q 112 @@ -3396,13 +3520,13 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define ZGEMM_DEFAULT_UNROLL_N 4 #define 
ZGEMM_DEFAULT_UNROLL_MN 16 -#define SGEMM_DEFAULT_P 128 -#define DGEMM_DEFAULT_P 160 +#define SGEMM_DEFAULT_P 240 +#define DGEMM_DEFAULT_P 240 #define CGEMM_DEFAULT_P 128 #define ZGEMM_DEFAULT_P 128 -#define SGEMM_DEFAULT_Q 352 -#define DGEMM_DEFAULT_Q 128 +#define SGEMM_DEFAULT_Q 640 +#define DGEMM_DEFAULT_Q 320 #define CGEMM_DEFAULT_Q 224 #define ZGEMM_DEFAULT_Q 112 @@ -3807,7 +3931,44 @@ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy rout #define SYMV_P 16 #endif +#if defined(CSKY) || defined(CK860FV) +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL +#define SGEMM_DEFAULT_UNROLL_M 2 +#define SGEMM_DEFAULT_UNROLL_N 2 + +#define DGEMM_DEFAULT_UNROLL_M 2 +#define DGEMM_DEFAULT_UNROLL_N 2 + +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 2 + +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_P 96 +#define ZGEMM_DEFAULT_P 64 + +#define SGEMM_DEFAULT_Q 240 +#define DGEMM_DEFAULT_Q 120 +#define CGEMM_DEFAULT_Q 120 +#define ZGEMM_DEFAULT_Q 120 + +#define SGEMM_DEFAULT_R 12288 +#define DGEMM_DEFAULT_R 8192 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + +#define SYMV_P 16 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 + +#endif #ifdef GENERIC diff --git a/test/Makefile b/test/Makefile index 56acf1c5b..5a4694ce6 100644 --- a/test/Makefile +++ b/test/Makefile @@ -276,6 +276,9 @@ ifeq ($(F_COMPILER), IBM) ifeq ($(C_COMPILER), GCC) CEXTRALIB += -lgomp endif +ifeq ($(C_COMPILER), CLANG) +CEXTRALIB += -lomp +endif endif endif diff --git a/test/compare_sgemm_sbgemm.c b/test/compare_sgemm_sbgemm.c index cf808b56d..bc74233ab 100644 --- a/test/compare_sgemm_sbgemm.c +++ b/test/compare_sgemm_sbgemm.c @@ -100,6 +100,8 @@ main (int argc, char *argv[]) float C[m * n]; bfloat16_bits AA[m * k], BB[k * n]; float DD[m * n], CC[m * n]; + bfloat16 atmp,btmp; + blasint one=1; for (j = 0; j < m; j++) { @@ -108,16 +110,18 @@ main (int argc, char *argv[]) A[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; B[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; C[j * k + i] = 0; - AA[j * k + i].v = *(uint32_t *) & A[j * k + i] >> 16; - BB[j * k + i].v = *(uint32_t *) & B[j * k + i] >> 16; + sbstobf16_(&one, &A[j*k+i], &one, &atmp, &one); + sbstobf16_(&one, &B[j*k+i], &one, &btmp, &one); + AA[j * k + i].v = atmp; + BB[j * k + i].v = btmp; CC[j * k + i] = 0; DD[j * k + i] = 0; } } SGEMM (&transA, &transB, &m, &n, &k, &alpha, A, &m, B, &k, &beta, C, &m); - SBGEMM (&transA, &transB, &m, &n, &k, &alpha, AA, - &m, BB, &k, &beta, CC, &m); + SBGEMM (&transA, &transB, &m, &n, &k, &alpha, (bfloat16*) AA, + &m, (bfloat16*)BB, &k, &beta, CC, &m); for (i = 0; i < n; i++) for (j = 0; j < m; j++) if (fabs (CC[i * m + j] - C[i * m + j]) > 1.0) diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt index 2e32827d3..edfcfb7cf 100644 --- a/utest/CMakeLists.txt +++ b/utest/CMakeLists.txt @@ -15,6 +15,9 @@ else () test_dsdot.c test_dnrm2.c test_swap.c + test_zscal.c + test_amin.c + test_axpby.c ) endif () diff --git a/utest/Makefile b/utest/Makefile index f99035440..c42496fb3 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -11,7 +11,8 @@ UTESTBIN=openblas_utest include $(TOPDIR)/Makefile.system -OBJS=utest_main.o test_min.o test_amax.o test_ismin.o test_rotmg.o test_axpy.o test_dotu.o test_dsdot.o test_swap.o test_rot.o test_dnrm2.o +OBJS=utest_main.o test_min.o test_amax.o 
test_ismin.o test_rotmg.o test_axpy.o test_dotu.o test_dsdot.o test_swap.o test_rot.o test_dnrm2.o test_zscal.o \ + test_amin.o test_axpby.o #test_rot.o test_swap.o test_axpy.o test_dotu.o test_dsdot.o test_fork.o ifneq ($(NO_LAPACK), 1) @@ -44,8 +45,18 @@ endif all : run_test +ifeq ($(OSNAME), AIX) +ifeq ($(USE_OPENMP), 1) +$(UTESTBIN): $(OBJS) + $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^ ../$(LIBNAME) $(EXTRALIB) +else +$(UTESTBIN): $(OBJS) + $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^ ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB) +endif +else $(UTESTBIN): $(OBJS) $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^ ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB) +endif run_test: $(UTESTBIN) ifneq ($(CROSS), 1) diff --git a/utest/test_amax.c b/utest/test_amax.c index a9e5a1c85..e9775caf0 100644 --- a/utest/test_amax.c +++ b/utest/test_amax.c @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011-2016, The OpenBLAS Project +Copyright (c) 2011-2024, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without @@ -13,9 +13,9 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the OpenBLAS project nor the names of - its contributors may be used to endorse or promote products - derived from this software without specific prior written + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" @@ -57,4 +57,31 @@ CTEST(amax, damax){ ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), DOUBLE_EPS); } #endif +#ifdef BUILD_COMPLEX +CTEST(amax, scamax){ + blasint N = 9, inc = 1; + float te_max = 0.0, tr_max = 0.0; + float x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8, + -9.9, 10.10, -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, + -7.7, 8.8 }; + te_max = BLASFUNC(scamax)(&N, x, &inc); + tr_max = 20.0; + + ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), SINGLE_EPS); +} +#endif +#ifdef BUILD_COMPLEX16 +CTEST(amax, dzamax){ + blasint N = 9, inc = 1; + double te_max = 0.0, tr_max = 0.0; + double x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8, + -9.9, 10.10, -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, + -7.7, 8.8 }; + + te_max = BLASFUNC(dzamax)(&N, x, &inc); + tr_max = 20.0; + + ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), DOUBLE_EPS); +} +#endif diff --git a/utest/test_amin.c b/utest/test_amin.c new file mode 100644 index 000000000..1305ab8ef --- /dev/null +++ b/utest/test_amin.c @@ -0,0 +1,89 @@ +/***************************************************************************** +Copyright (c) 2011-2024, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. 
Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "openblas_utest.h" + +#ifdef BUILD_SINGLE +CTEST(amin, samin){ + blasint N = 3, inc = 1; + float te_min = 0.0, tr_min = 0.0; + float x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8, + -9.9 }; + + te_min = BLASFUNC(samin)(&N, x, &inc); + tr_min = 1.1; + + ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), SINGLE_EPS); +} +#endif +#ifdef BUILD_DOUBLE +CTEST(amin, damin){ + blasint N = 3, inc = 1; + double te_min = 0.0, tr_min = 0.0; + double x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8, + -9.9 }; + + te_min = BLASFUNC(damin)(&N, x, &inc); + tr_min = 1.1; + + ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), DOUBLE_EPS); +} +#endif +#ifdef BUILD_COMPLEX +CTEST(amin, scamin){ + blasint N = 9, inc = 1; + float te_min = 0.0, tr_min = 0.0; + float x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8, + -9.9, 10.10, -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, + -7.7, 8.8 }; + + te_min = BLASFUNC(scamin)(&N, x, &inc); + tr_min = 3.3; + + ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), SINGLE_EPS); +} +#endif +#ifdef BUILD_COMPLEX16 +CTEST(amin, dzamin){ + blasint N = 9, inc = 1; + double te_min = 0.0, tr_min = 0.0; + double x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8, + -9.9, 10.10, -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, + -7.7, 8.8 }; + + te_min = BLASFUNC(dzamin)(&N, x, &inc); + tr_min = 3.3; + + ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), DOUBLE_EPS); +} +#endif diff --git a/utest/test_axpby.c b/utest/test_axpby.c new file mode 100644 index 000000000..37ba8ad14 --- /dev/null +++ b/utest/test_axpby.c @@ -0,0 +1,320 @@ +/***************************************************************************** +Copyright (c) 2011-2024, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. 
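The complex amax/amin tests above rely on the BLAS convention that the "absolute value" of a complex element is |Re| + |Im| rather than the Euclidean modulus; that is how the reference values 20.0 (scamax/dzamax) and 3.3 (scamin/dzamin) are obtained from the nine-element test vector. A small stand-alone sketch that recomputes those values without calling the library (plain C, assuming only <math.h>):

    #include <stdio.h>
    #include <math.h>

    /* Recompute the reference values used in the scamax/scamin tests:
     * for complex data the "absolute value" is taken as |Re| + |Im|. */
    int main(void)
    {
        /* the 9 complex elements from the test vector (interleaved re,im) */
        float x[] = { -1.1f, 2.2f, -3.3f, 4.4f, -5.5f, 6.6f, -7.7f, 8.8f,
                      -9.9f, 10.10f, -1.1f, 2.2f, -3.3f, 4.4f, -5.5f, 6.6f,
                      -7.7f, 8.8f };
        int n = 9;
        float amax = 0.0f, amin = 0.0f;
        for (int i = 0; i < n; i++) {
            float a = fabsf(x[2 * i]) + fabsf(x[2 * i + 1]);
            if (i == 0 || a > amax) amax = a;
            if (i == 0 || a < amin) amin = a;
        }
        printf("scamax reference = %.1f (expected 20.0)\n", amax);
        printf("scamin reference = %.1f (expected 3.3)\n", amin);
        return 0;
    }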
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "openblas_utest.h" + +#ifdef BUILD_SINGLE +CTEST(axpby, saxpby_inc_0) +{ + blasint i; + blasint N = 9, incX = 0, incY = 0; + float alpha = 1.0, beta = 2.0; + float x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + float y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + BLASFUNC(saxpby)(&N, &alpha, x1, &incX, &beta, y1, &incY); + + float x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + float y2[] = { 1535.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + for(i = 0; i < N; i++){ + ASSERT_DBL_NEAR_TOL(x2[i], x1[i], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(y2[i], y1[i], SINGLE_EPS); + } +} + +CTEST(axpby, saxpby_inc_1) +{ + blasint i; + blasint N = 9, incX = 1, incY = 1; + float alpha = 0.25, beta = 0.75; + float x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + float y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + BLASFUNC(saxpby)(&N, &alpha, x1, &incX, &beta, y1, &incY); + + float x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + float y2[] = { 1.75, 3.75, 5.75, 7.75, 1.75, 3.75, 5.75, 7.75, 9.75 }; + + for(i = 0; i < N; i++){ + ASSERT_DBL_NEAR_TOL(x2[i], x1[i], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(y2[i], y1[i], SINGLE_EPS); + } +} + +CTEST(axpby, saxpby_inc_2) +{ + blasint i; + blasint N = 9, incX = 2, incY = 2; + float alpha = 0.25, beta = 0.75; + float x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + float y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0, + 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + BLASFUNC(saxpby)(&N, &alpha, x1, &incX, &beta, y1, &incY); + + float x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + float y2[] = { 1.75, 4.00, 5.75, 8.00, 1.75, 4.00, 5.75, 8.00, + 9.75, 2.00, 3.75, 6.00, 7.75, 2.00, 3.75, 6.00, + 7.75, 10.00 }; + + for(i = 0; i < 2 * N; i++){ + ASSERT_DBL_NEAR_TOL(x2[i], x1[i], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(y2[i], y1[i], SINGLE_EPS); + } +} +#endif + +#ifdef BUILD_DOUBLE +CTEST(axpby, daxpby_inc_0) +{ + blasint i; + blasint N = 9, incX = 0, incY = 0; + double alpha = 1.0, beta = 2.0; + double x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + double y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + BLASFUNC(daxpby)(&N, &alpha, x1, &incX, &beta, y1, &incY); + + double x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + double y2[] = { 1535.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + for(i = 0; i < N; i++){ + ASSERT_DBL_NEAR_TOL(x2[i], x1[i], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); + } +} + +CTEST(axpby, daxpby_inc_1) +{ + blasint i; + blasint N = 9, incX = 1, incY = 1; + double 
alpha = 0.25, beta = 0.75; + double x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + double y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + BLASFUNC(daxpby)(&N, &alpha, x1, &incX, &beta, y1, &incY); + + double x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + double y2[] = { 1.75, 3.75, 5.75, 7.75, 1.75, 3.75, 5.75, 7.75, 9.75 }; + + for(i = 0; i < N; i++){ + ASSERT_DBL_NEAR_TOL(x2[i], x1[i], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); + } +} + +CTEST(axpby, daxpby_inc_2) +{ + blasint i; + blasint N = 9, incX = 2, incY = 2; + double alpha = 0.25, beta = 0.75; + double x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + double y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0, + 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + BLASFUNC(daxpby)(&N, &alpha, x1, &incX, &beta, y1, &incY); + + double x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + double y2[] = { 1.75, 4.00, 5.75, 8.00, 1.75, 4.00, 5.75, 8.00, + 9.75, 2.00, 3.75, 6.00, 7.75, 2.00, 3.75, 6.00, + 7.75, 10.00 }; + + for(i = 0; i < 2 * N; i++){ + ASSERT_DBL_NEAR_TOL(x2[i], x1[i], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); + } +} +#endif + +#ifdef BUILD_COMPLEX +CTEST(axpby, caxpby_inc_0) +{ + blasint i; + blasint N = 9, incX = 0, incY = 0; + float alpha[] = { 1.0, 2.0 }, beta[] = { 2.0, 1.0 }; + float x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + float y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0, + 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + BLASFUNC(caxpby)(&N, alpha, x1, &incX, beta, y1, &incY); + + float x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + float y2[] = { 9355.0, -8865.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, + 10.0, 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + for(i = 0; i < 2 * N; i++){ + ASSERT_DBL_NEAR_TOL(x2[i], x1[i], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(y2[i], y1[i], SINGLE_EPS); + } +} + +CTEST(axpby, caxpby_inc_1) +{ + blasint i; + blasint N = 9, incX = 1, incY = 1; + float alpha[] = { 0.25, 0.25 }, beta[] = { 0.75, 0.75 }; + float x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + float y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0, + 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + BLASFUNC(caxpby)(&N, alpha, x1, &incX, beta, y1, &incY); + + float x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + float y2[] = { -2.0, 5.5, -2.0, 13.5, -2.0, 5.5, -2.0, 13.5, + 8.0, 11.5, -2.0, 9.5, 6.0, 9.5, -2.0, 9.5, -2.0, 17.5 }; + + for(i = 0; i < 2 * N; i++){ + ASSERT_DBL_NEAR_TOL(x2[i], x1[i], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(y2[i], y1[i], SINGLE_EPS); + } +} + +CTEST(axpby, caxpby_inc_2) +{ + blasint i; + blasint N = 9, incX = 2, incY = 2; + float alpha[] = { 0.25, 0.25 }, beta[] = { 0.75, 0.75 }; + float x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + float y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0, + 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0, + 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0, + 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + BLASFUNC(caxpby)(&N, &alpha, x1, &incX, &beta, y1, &incY); + + float x2[] = { 1.0, 
3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + float y2[] = { -2.0, 5.5, 6.0, 8.0, -2.0, 5.5, 6.0, 8.0, 8.0, + 11.5, 4.0, 6.0, 6.0, 9.5, 4.0, 6.0, -2.0, 17.5, + 2.0, 4.0, -2.0, 13.5, 2.0, 4.0, -2.0, 13.5, 10.0, + 2.0, -2.0, 9.5, 8.0, 2.0, -2.0, 9.5, 8.0, 10.0 }; + + for(i = 0; i < 4 * N; i++){ + ASSERT_DBL_NEAR_TOL(x2[i], x1[i], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(y2[i], y1[i], SINGLE_EPS); + } +} +#endif + +#ifdef BUILD_COMPLEX16 +CTEST(axpby, zaxpby_inc_0) +{ + blasint i; + blasint N = 9, incX = 0, incY = 0; + double alpha[] = { 1.0, 2.0 }, beta[] = { 2.0, 1.0 }; + double x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + double y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0, + 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + BLASFUNC(zaxpby)(&N, alpha, x1, &incX, beta, y1, &incY); + + double x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + double y2[] = { 9355.0, -8865.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, + 10.0, 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + for(i = 0; i < 2 * N; i++){ + ASSERT_DBL_NEAR_TOL(x2[i], x1[i], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); + } +} + +CTEST(axpby, zaxpby_inc_1) +{ + blasint i; + blasint N = 9, incX = 1, incY = 1; + double alpha[] = { 0.25, 0.25 }, beta[] = { 0.75, 0.75 }; + double x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + double y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0, + 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + BLASFUNC(zaxpby)(&N, alpha, x1, &incX, beta, y1, &incY); + + double x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + double y2[] = { -2.0, 5.5, -2.0, 13.5, -2.0, 5.5, -2.0, 13.5, + 8.0, 11.5, -2.0, 9.5, 6.0, 9.5, -2.0, 9.5, -2.0, 17.5 }; + + for(i = 0; i < 2 * N; i++){ + ASSERT_DBL_NEAR_TOL(x2[i], x1[i], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); + } +} + +CTEST(axpby, zaxpby_inc_2) +{ + blasint i; + blasint N = 9, incX = 2, incY = 2; + double alpha[] = { 0.25, 0.25 }, beta[] = { 0.75, 0.75 }; + double x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + double y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0, + 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0, + 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0, + 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + BLASFUNC(zaxpby)(&N, &alpha, x1, &incX, &beta, y1, &incY); + + double x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + double y2[] = { -2.0, 5.5, 6.0, 8.0, -2.0, 5.5, 6.0, 8.0, 8.0, + 11.5, 4.0, 6.0, 6.0, 9.5, 4.0, 6.0, -2.0, 17.5, + 2.0, 4.0, -2.0, 13.5, 2.0, 4.0, -2.0, 13.5, 10.0, + 2.0, -2.0, 9.5, 8.0, 2.0, -2.0, 9.5, 8.0, 10.0 }; + + for(i = 0; i < 4 * N; i++){ + ASSERT_DBL_NEAR_TOL(x2[i], x1[i], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); + } +} +#endif diff --git a/utest/test_zscal.c b/utest/test_zscal.c new file mode 100644 index 000000000..ffc851e8b --- /dev/null +++ b/utest/test_zscal.c @@ -0,0 +1,104 @@ +#include "openblas_utest.h" 
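The expected vectors in the ?axpby tests above follow directly from y := alpha*x + beta*y evaluated with complex arithmetic; for example, with alpha = 0.25+0.25i, beta = 0.75+0.75i, x[0] = 1+3i and y[0] = 2+4i the first output element is -2.0+5.5i, matching y2 in caxpby_inc_1. A short stand-alone check of that arithmetic (C99 complex, independent of the BLAS routine):

    #include <stdio.h>
    #include <complex.h>

    /* Reproduce the first expected element of the caxpby_inc_1 test:
     * y := alpha*x + beta*y, evaluated with ordinary complex arithmetic. */
    int main(void)
    {
        float complex alpha = 0.25f + 0.25f * I;
        float complex beta  = 0.75f + 0.75f * I;
        float complex x0 = 1.0f + 3.0f * I;   /* first element of x1 */
        float complex y0 = 2.0f + 4.0f * I;   /* first element of y1 */

        float complex r = alpha * x0 + beta * y0;

        /* expected: -2.0 + 5.5i, matching y2[0], y2[1] in the test */
        printf("result = %g %+gi\n", crealf(r), cimagf(r));
        return 0;
    }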
+#include <math.h>
+#ifdef BUILD_COMPLEX16
+
+#ifndef NAN
+#define NAN 0.0/0.0
+#endif
+#ifndef INFINITY
+#define INFINITY 1.0/0.0
+#endif
+
+CTEST(zscal, i_nan)
+{
+ double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 };
+ double nan[] = {NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0};
+ cblas_zscal(9, i, &nan, 1);
+ ASSERT_TRUE(isnan(nan[0]));
+ ASSERT_TRUE(isnan(nan[1]));
+ ASSERT_TRUE(isnan(nan[16]));
+ ASSERT_TRUE(isnan(nan[17]));
+}
+
+CTEST(zscal, i_nan_inc_2)
+{
+ double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 };
+ double nan[] = {NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0,
+ NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0};
+ cblas_zscal(9, i, &nan, 2);
+ ASSERT_TRUE(isnan(nan[0]));
+ ASSERT_TRUE(isnan(nan[1]));
+ ASSERT_TRUE(isnan(nan[16]));
+ ASSERT_TRUE(isnan(nan[17]));
+}
+
+CTEST(zscal, nan_i)
+{
+ double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 };
+ double nan[] = {NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0};
+ cblas_zscal(9, &nan, &i, 1);
+ ASSERT_TRUE(isnan(i[0]));
+ ASSERT_TRUE(isnan(i[1]));
+ ASSERT_TRUE(isnan(i[16]));
+ ASSERT_TRUE(isnan(i[17]));
+}
+
+CTEST(zscal, nan_i_inc_2)
+{
+ double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1,
+ 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 };
+ double nan[] = {NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0};
+ cblas_zscal(9, &nan, &i, 2);
+ ASSERT_TRUE(isnan(i[0]));
+ ASSERT_TRUE(isnan(i[1]));
+ ASSERT_TRUE(isnan(i[16]));
+ ASSERT_TRUE(isnan(i[17]));
+}
+
+CTEST(zscal, i_inf)
+{
+ double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 };
+ double inf[] = {INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0};
+ cblas_zscal(9, i, &inf, 1);
+ ASSERT_TRUE(isnan(inf[0]));
+ ASSERT_TRUE(isinf(inf[1]));
+ ASSERT_TRUE(isnan(inf[16]));
+ ASSERT_TRUE(isinf(inf[17]));
+}
+
+CTEST(zscal, i_inf_inc_2)
+{
+ double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 };
+ double inf[] = {INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0,
+ INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0};
+ cblas_zscal(9, i, &inf, 2);
+ ASSERT_TRUE(isnan(inf[0]));
+ ASSERT_TRUE(isinf(inf[1]));
+ ASSERT_TRUE(isnan(inf[16]));
+ ASSERT_TRUE(isinf(inf[17]));
+}
+
+CTEST(zscal, inf_i)
+{
+ double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 };
+ double inf[] = {INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0};
+ cblas_zscal(9, &inf, &i, 1);
+ ASSERT_TRUE(isnan(i[0]));
+ ASSERT_TRUE(isinf(i[1]));
+ ASSERT_TRUE(isnan(i[16]));
+ ASSERT_TRUE(isinf(i[17]));
+}
+
+CTEST(zscal, inf_i_inc_2)
+{
+ double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1,
+ 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 };
+ double inf[] = {INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0};
+ cblas_zscal(9, &inf, &i, 2);
+ ASSERT_TRUE(isnan(i[0]));
+ ASSERT_TRUE(isinf(i[1]));
+ ASSERT_TRUE(isnan(i[16]));
+ ASSERT_TRUE(isinf(i[17]));
+}
+
+#endif
diff --git a/utest/utest_main2.c b/utest/utest_main2.c
index 4382bf159..46a3b067d 100644
--- a/utest/utest_main2.c
+++ b/utest/utest_main2.c
@@ -38,6 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CTEST_SEGFAULT #define CTEST_ADD_TESTS_MANUALLY +#include "cblas.h" #include "openblas_utest.h" CTEST(amax, samax){ @@ -617,6 +618,51 @@ CTEST(max, smax_zero){ ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), SINGLE_EPS); } + +CTEST(zscal, i_nan) +{ + double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; + double nan[] = {NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0}; + cblas_zscal(9, i, &nan, 1); + ASSERT_TRUE(isnan(nan[0])); + ASSERT_TRUE(isnan(nan[1])); + ASSERT_TRUE(isnan(nan[16])); + ASSERT_TRUE(isnan(nan[17])); +} + +CTEST(zscal, nan_i) +{ + double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; + double nan[] = {NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0}; + cblas_zscal(9, &nan, &i, 1); + ASSERT_TRUE(isnan(i[0])); + ASSERT_TRUE(isnan(i[1])); + ASSERT_TRUE(isnan(i[16])); + ASSERT_TRUE(isnan(i[17])); + } + +CTEST(zscal, i_inf) +{ + double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; + double inf[] = {INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0}; + cblas_zscal(9, i, &inf, 1); + ASSERT_TRUE(isnan(inf[0])); + ASSERT_TRUE(isinf(inf[1])); + ASSERT_TRUE(isnan(inf[16])); + ASSERT_TRUE(isinf(inf[17])); +} + +CTEST(zscal, inf_i) +{ + double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; + double inf[] = {INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0}; + cblas_zscal(9, &inf, &i, 1); + ASSERT_TRUE(isnan(i[0])); + ASSERT_TRUE(isinf(i[1])); + ASSERT_TRUE(isnan(i[16])); + ASSERT_TRUE(isinf(i[17])); +} + int main(int argc, const char ** argv){ CTEST_ADD (amax, samax); @@ -648,7 +694,10 @@ int main(int argc, const char ** argv){ CTEST_ADD (swap,zswap_inc_0); CTEST_ADD (swap,sswap_inc_0); CTEST_ADD (swap,cswap_inc_0); - + CTEST_ADD (zscal, i_nan); + CTEST_ADD (zscal, nan_i); + CTEST_ADD (zscal, i_inf); + CTEST_ADD (zscal, inf_i); int num_fail=0; num_fail=ctest_main(argc, argv);
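The NaN/Inf expectations in the zscal tests registered above follow from the textbook complex multiply a ZSCAL-style kernel performs: scaling inf+0i by 0+1i gives (0*inf - 1*0) for the real part, which is NaN because 0*inf is undefined in IEEE-754, and (0*0 + 1*inf) = +inf for the imaginary part — exactly what the isnan/isinf assertions check. A minimal stand-alone illustration of that arithmetic (plain C, no cblas_zscal call involved):

    #include <stdio.h>
    #include <math.h>

    /* Multiply (a+bi) * (c+di) with the textbook formula, as a ZSCAL-style
     * kernel would, and show how NaN/Inf propagate through it. */
    int main(void)
    {
        double a = 0.0, b = 1.0;            /* scale factor 0 + 1i     */
        double c = INFINITY, d = 0.0;       /* vector element inf + 0i */

        double re = a * c - b * d;          /* 0*inf - 1*0  -> NaN     */
        double im = a * d + b * c;          /* 0*0  + 1*inf -> +inf    */

        printf("re: isnan=%d  im: isinf=%d\n", isnan(re), isinf(im));
        return 0;
    }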